Spaces:
Runtime error
Runtime error
first deploy
Browse files- app.py +69 -11
- requirements.txt +3 -1
app.py
CHANGED
@@ -2,28 +2,86 @@ import gradio as gr
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import pickle
|
|
|
|
|
|
|
|
|
5 |
|
|
|
|
|
|
|
|
|
6 |
|
7 |
-
|
|
|
|
|
|
|
8 |
|
9 |
-
def predict(
|
10 |
|
11 |
-
|
|
|
|
|
12 |
|
13 |
-
|
14 |
-
|
15 |
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
demo = gr.Interface(
|
21 |
predict,
|
22 |
-
|
23 |
-
|
24 |
examples=[
|
|
|
|
|
|
|
|
|
25 |
],
|
26 |
-
title = '
|
27 |
)
|
28 |
|
29 |
-
demo.launch()
|
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import pickle
|
5 |
+
import nltk
|
6 |
+
from nltk import word_tokenize
|
7 |
+
from nltk.util import ngrams
|
8 |
+
from unidecode import unidecode
|
9 |
|
10 |
+
# --- Setup: tokenizer data + entity dictionary -------------------------------
# word_tokenize() used in predict() requires the NLTK 'punkt' tokenizer models,
# which are NOT bundled with the nltk pip package. Without this download the
# first prediction crashes with LookupError on a fresh deployment.
nltk.download('punkt', quiet=True)

# Read the entity dictionary (one entity per row: Entidad -> Categoria).
diccionario = pd.read_excel('diccionario.xlsx')
# Drop the first row — presumably a placeholder/secondary-header row in the
# spreadsheet. TODO(review): confirm against diccionario.xlsx.
diccionario = diccionario.iloc[1:]
all_dicts = diccionario.apply(lambda x: {x['Entidad']: x['Categoria']}, axis = 1)

# Flatten the per-row single-entry dicts into one {entity: category} mapping.
# Later rows overwrite earlier duplicates, matching the original update order.
entities_dict = {}
for i in all_dicts:
    entities_dict.update(i)
|
19 |
|
20 |
+
def predict(text):
    """Label entities in *text* against the module-level ``entities_dict``.

    Returns a list of ``(token, category_or_None)`` tuples, the format
    consumed by the HighlightedText output of the Gradio interface below.
    Matching is case- and accent-insensitive (via ``unidecode`` + ``lower``).
    """
    diccionario = entities_dict.copy()
    tokens = word_tokenize(text, language = 'spanish')
    # Lowercased, accent-stripped tokens used for dictionary matching.
    tokens_lower = [unidecode(token.lower()) for token in tokens]

    # Map normalized token -> original token, and normalized entity -> original
    # dictionary key. NOTE(review): duplicate normalized tokens collapse to the
    # last occurrence — earlier originals are lost. Confirm this is acceptable.
    dict_tokens = {tokens_lower[i]: tokens[i] for i in range(len(tokens))}
    dict_keys = {unidecode(key.lower()): key for key in diccionario.keys()}

    # Detect which dictionary entities appear in the text as n-grams.
    ngram_range = 5  # maximum n-gram size to evaluate
    nmin = 1  # size of the largest n-gram entity found in the text (1 = none)
    grams_detected = {}
    for i in range(2, ngram_range + 1):
        n_grams = [' '.join(ngram) for ngram in list(nltk.ngrams(tokens_lower, i))]
        intersection = list(set(n_grams) & set(dict_keys.keys()))
        if len(intersection) > 0:
            nmin = i
            grams_detected.update({nmin: intersection})

    # Encode each detected multi-word entity in the raw text as "<level><sep><index>"
    # so it survives whitespace splitting as a single token. Longest n-grams first,
    # so e.g. "san pedro de la paz" is consumed before "san pedro".
    # NOTE(review): detected n-grams are lowercased/unaccented but tmp_text keeps
    # the original casing, so str.replace only matches when the input text is
    # already lowercase/unaccented (as in the examples) — confirm intended.
    sep = '%$·'
    tmp_text = text
    for i in range(5, 1, -1):
        try:
            # Encode every detected n-gram of size "i".
            for j in range(len(grams_detected[i])):
                tmp_text = tmp_text.replace(grams_detected[i][j], f'{i}{sep}{j}')
        except KeyError:  # no n-grams of size "i" were detected — skip
            pass

    labeled_tokens = []
    # Only single-word entities present: label the sentence token by token.
    if nmin < 2:
        for token in tokens_lower:
            labeled_tokens.append((dict_tokens[token], diccionario[dict_keys[token]]) if token in dict_keys.keys() else (token, None))

        return labeled_tokens

    # Multi-word entities present: re-split the encoded text and decode markers.
    else:
        tmp_text = ' '.join(tmp_text.split())  # collapse repeated whitespace
        tmp_tokens = tmp_text.split()
        for token in tmp_tokens:
            if sep in token:
                # Decode "<level><sep><index>" back into the detected n-gram.
                level, pos = token.split(sep)
                encoded_token = grams_detected[int(level)][int(pos)]
                labeled_tokens.append((encoded_token, diccionario[dict_keys[encoded_token]]))
            elif token in dict_keys.keys():
                # NOTE(review): here `token` comes from the raw text split, while
                # dict_tokens/dict_keys are keyed by normalized forms — a KeyError
                # is possible for cased/accented input. Confirm inputs are normalized.
                labeled_tokens.append((dict_tokens[token], diccionario[dict_keys[token]]))
            else:
                labeled_tokens.append((token, None))

        return labeled_tokens
|
73 |
|
74 |
# Gradio UI: free-text input -> per-token highlighted entity labels.
demo = gr.Interface(
    predict,
    gr.Textbox(placeholder = "Ingresa el texto acá", label = 'Texto'),
    # FIX: the Gradio component class is `HighlightedText` (capital T);
    # `gr.Highlightedtext` does not exist and raises AttributeError at startup.
    gr.HighlightedText(label = 'Etiquetas'),
    examples=[
        ['hola!! estoy en santiago manejando en mi ferrari que compré en marzo'],
        ['este septiembre iremos manejando a temuco en un toyota para pasar las fiestas patrias'],
        ['no puedo, tengo que irme desde san pedro de la paz hasta santiago'],
        ['no puedo, tengo que irme desde san pedro hasta la reina y luego hasta san pedro de la paz']
    ],
    title = 'Detección de Entidades'
)

demo.launch()
|
requirements.txt
CHANGED
@@ -1,4 +1,6 @@
|
|
1 |
pandas
|
2 |
numpy
|
3 |
openpyxl
|
4 |
-
scikit-learn
|
|
|
|
|
|
1 |
pandas
|
2 |
numpy
|
3 |
openpyxl
|
4 |
+
scikit-learn
|
5 |
+
nltk
|
6 |
+
unidecode
|