# Hugging Face Space: Spanish entity detection + SERNAC classification demo.
import pickle
from functools import lru_cache

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk.util import ngrams
from unidecode import unidecode
nltk.download('punkt') | |
# leemos diccionario de entidades | |
diccionario = pd.read_excel('diccionario.xlsx') | |
diccionario = diccionario.iloc[1:] | |
all_dicts = diccionario.apply(lambda x: {x['Entidad']: x['Categoria']}, axis = 1) | |
# formateamos diccionario | |
entities_dict = {} | |
for i in all_dicts: | |
entities_dict.update(i) | |
def predict(text): | |
diccionario = entities_dict.copy() | |
tokens = word_tokenize(text, language = 'spanish') | |
tokens_lower = [unidecode(token.lower()) for token in tokens] # tokens en minuscula | |
dict_tokens = {tokens_lower[i]: tokens[i] for i in range(len(tokens))} | |
dict_keys = {unidecode(key.lower()): key for key in diccionario.keys()} | |
# presencia de ngrams | |
ngram_range = 5 # rango de ngramas a evaluar | |
nmin = 1 # numero minimo de ngramas presente en el texto | |
grams_detected = {} | |
for i in range(2, ngram_range + 1): | |
n_grams = [' '.join(ngram) for ngram in list(nltk.ngrams(tokens_lower, i))] | |
intersection = list(set(n_grams) & set(dict_keys.keys())) | |
if len(intersection) > 0: | |
nmin = i | |
grams_detected.update({nmin: intersection}) | |
sep = '%$路' | |
tmp_text = text | |
for i in range(5, 1, -1): | |
try: | |
# obtener todos los ngramas de nivel "i" | |
for j in range(len(grams_detected[i])): | |
tmp_text = tmp_text.replace(grams_detected[i][j], f'{i}{sep}{j}') | |
except KeyError: # en caso de que no existan ngramas de nivel "i", pass | |
pass | |
labeled_tokens = [] | |
#聽si hay solo entidades de largo 1, devuelvo oracion etiquetada token a token | |
if nmin < 2: | |
for token in tokens_lower: | |
labeled_tokens.append((dict_tokens[token], diccionario[dict_keys[token]]) if token in dict_keys.keys() else (token, None)) | |
# si hay entidades de largo 2 o mas, devuelvo solo las entidades etiquetadas | |
else: | |
tmp_text = ' '.join(tmp_text.split()) #聽texto sin espacios | |
tmp_tokens = tmp_text.split() | |
for token in tmp_tokens: | |
if sep in token: | |
level, pos = token.split(sep) | |
encoded_token = grams_detected[int(level)][int(pos)] | |
labeled_tokens.append((encoded_token, diccionario[dict_keys[encoded_token]])) | |
elif token in dict_keys.keys(): | |
labeled_tokens.append((dict_tokens[token], diccionario[dict_keys[token]])) | |
else: | |
labeled_tokens.append((token, None)) | |
# SERNAC CLASSIFICATION | |
with open('sernac_model.pkl', 'rb') as model: | |
clf = pickle.load(model) | |
labels = [label for label in clf.classes_] | |
sernac_probas = clf.predict_proba([text]) | |
sernac_probas = {labels[i]: float(sernac_probas[0][i]) for i in range(sernac_probas.shape[1])} | |
# SERNAC CATEGORIES CLASSIFICATION | |
with open('sernac_categories_model.pkl', 'rb') as model: | |
clf = pickle.load(model) | |
labels = [label for label in clf.classes_] | |
probas = clf.predict_proba([text]) | |
sernac_categories = {labels[i]: float(probas[0][i]) for i in range(probas.shape[1])} | |
return labeled_tokens, sernac_probas, sernac_categories | |
# DEMO | |
demo = gr.Interface( | |
predict, | |
inputs = gr.Textbox(placeholder = "Ingresa el texto ac谩", label = 'Texto'), | |
outputs = [gr.Highlightedtext(label = 'Etiquetas'), gr.outputs.Label(label = 'Clasificaci贸n Sernac'), gr.outputs.Label(label = 'Clasificaci贸n Categor铆as Sernac')], | |
examples=[ | |
['este septiembre iremos manejando a temuco en un toyota para pasar las fiestas patrias'], | |
['no puedo, tengo que irme desde san pedro hasta la reina y luego hasta san pedro de la paz'], | |
['Buenas tardes, hace unas semanas compre un suzuki swift a derco de santiago, llevaba 2 semanas y la caja de cambios se ech贸 a perder. Tengo asegurado el auto con BCI, pero aun no obtengo respuesta. '] | |
], | |
title = 'Detecci贸n de Entidades' | |
) | |
demo.launch() |