Added classification models for subcategories
Browse files- app.py +134 -25
- production_models/cliente/cliente_model.pkl +0 -0
- production_models/cliente/lstm_atencionalcliente.pt +0 -0
- production_models/conforme/conforme_model.pkl +0 -0
- production_models/conforme/lstm_conforme.pt +0 -0
- production_models/devoluciones/devoluciones_model.pkl +0 -0
- production_models/devoluciones/lstm_devoluciones.pt +0 -0
- production_models/entrega/entrega_model.pkl +0 -0
- production_models/entrega/lstm_entrega.pt +0 -0
- production_models/financiamiento/financiamiento_model.pkl +0 -0
- production_models/financiamiento/lstm_financiamiento.pt +0 -0
- production_models/marketing/lstm_trademarketing.pt +0 -0
- production_models/marketing/marketing_model.pkl +0 -0
- production_models/otros/lstm_otros.pt +0 -0
- production_models/otros/otros_model.pkl +0 -0
- production_models/stock/lstm_stock.pt +0 -0
- production_models/stock/stock_model.pkl +0 -0
- production_models/ventas/lstm_ventas.pt +0 -0
- production_models/ventas/ventas_model.pkl +0 -0
- utils/load_model.py +27 -0
- utils/lstm.py +37 -0
- utils/production_model +36 -0
- utils/tokenizer.py +167 -0
app.py
CHANGED
@@ -1,40 +1,149 @@
|
|
1 |
import gradio as gr
|
2 |
import numpy as np
|
3 |
from transformers import pipeline
|
|
|
|
|
|
|
|
|
4 |
|
5 |
-
#
|
|
|
|
|
6 |
pipeline_clf = pipeline("text-classification", model = "stinoco/beto-sentiment-analysis-finetuned", return_all_scores = True)
|
7 |
pipeline_pos = pipeline("token-classification", model = "sagorsarker/codeswitch-spaeng-pos-lince")
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
'''
|
13 |
-
Función que recibe texto como input, devuelve la clasificación de texto para ser recibida por el demo.
|
14 |
-
text: texto a clasificar (str)
|
15 |
-
'''
|
16 |
-
|
17 |
# Text Classification
|
18 |
classes = pipeline_clf(text)[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
|
21 |
-
|
22 |
|
23 |
-
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
-
|
|
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
['un solo vendedor no pude estar encargado de miles de articulos debe especificarse en cerveza'],
|
35 |
-
['no hay mercaderia']
|
36 |
-
],
|
37 |
-
title = 'Demo Clasificación NPS'
|
38 |
-
)
|
39 |
|
40 |
-
demo.launch()
|
|
|
1 |
import gradio as gr
import numpy as np
from transformers import pipeline

# The classes below are not referenced by name in this script.
# NOTE(review): presumably they must be importable here so that pickle.load /
# torch.load (called inside load_model) can resolve the serialized objects --
# confirm against how the .pkl / .pt files were produced.
# The modules define `Tokenizer` and `LSTM`; importing the lowercase names
# `tokenizer` / `lstm` raises ImportError at startup.
from utils.tokenizer import Tokenizer
from utils.lstm import LSTM
from utils.load_model import load_model
# NOTE(review): the commit lists this file as `utils/production_model` (no
# `.py` suffix); the import below only works once the file is named
# `utils/production_model.py`.
from utils.production_model import ProductionModel

# Load models

## Transformers
# General ("macro") category classifier; return_all_scores = True makes it
# return the score of every label instead of only the best one.
pipeline_clf = pipeline("text-classification", model = "stinoco/beto-sentiment-analysis-finetuned", return_all_scores = True)
pipeline_pos = pipeline("token-classification", model = "sagorsarker/codeswitch-spaeng-pos-lince")

## LSTM
# One sub-category classifier per general category, each loaded from
# production_models/<folder>.
clf_marketing = load_model('marketing')
clf_cliente = load_model('cliente')
clf_conforme = load_model('conforme')
clf_devoluciones = load_model('devoluciones')
clf_entrega = load_model('entrega')
clf_financiamiento = load_model('financiamiento')
clf_otros = load_model('otros')
clf_stock = load_model('stock')
clf_ventas = load_model('ventas')
|
25 |
|
26 |
+
# PREDICT
|
27 |
+
def predict(text):
    '''
    Classify a complaint and build the update for every output component of the demo.

    The general classifier scores the text; the four best labels plus a
    'Resto' (remainder) bucket are shown in `macro_output`. The best label is
    split on ' - ' and, for every general category it names, the matching
    sub-category classifier is run and its row is made visible. All other
    sub-category rows are cleared and hidden.

    text: complaint to classify (str)

    Returns a dict mapping each Gradio output component to its new value.
    '''

    # (general category name, row container, label component, sub-category classifier)
    routes = (
        ('Atención al cliente', row_cliente, cliente_component, clf_cliente),
        ('Conforme', row_conforme, conforme_component, clf_conforme),
        ('Devoluciones', row_devoluciones, devoluciones_component, clf_devoluciones),
        ('Entrega', row_entrega, entrega_component, clf_entrega),
        ('Financiamiento', row_financiamiento, financiamiento_component, clf_financiamiento),
        ('Otros', row_otros, otros_component, clf_otros),
        ('Stock', row_stock, stock_component, clf_stock),
        ('Trade Marketing', row_marketing, marketing_component, clf_marketing),
        ('Ventas', row_ventas, ventas_component, clf_ventas),
    )

    # Start from a fully reset UI: nothing predicted, every sub-category row hidden.
    output = {macro_output: None}
    for _, row, component, _ in routes:
        output[component] = None
        output[row] = gr.update(visible = False)

    # Nothing to classify: leave the UI reset instead of feeding empty text to the models.
    if not text or not text.strip():
        return output

    # Text Classification
    classes = pipeline_clf(text)[0]
    scores = {element['label']: element['score'] for element in classes}
    top = sorted(scores.items(), key = lambda item: item[1], reverse = True)[:4]

    # The predicted label is the best *real* label. It is taken before the
    # 'Resto' bucket is added so that the aggregated remainder can never be
    # selected as the prediction.
    macro_label = top[0][0]
    macro_labels = macro_label.split(' - ')

    macro_probas = dict(top)
    # Clamp at 0: rounding can push the remainder slightly below zero.
    macro_probas['Resto'] = max(0.0, 1 - sum(macro_probas.values()))
    output[macro_output] = macro_probas

    for category, row, component, classifier in routes:
        if category in macro_labels:
            output[row] = gr.update(visible = True)
            output[component] = classifier.predict([text])

    return output
|
83 |
+
|
84 |
+
|
85 |
+
# DEMO
|
86 |
+
# NOTE(review): the nesting of the layout below was reconstructed from a diff
# whose indentation was lost -- confirm it matches the intended layout.
with gr.Blocks(title = 'Modelo NPS') as demo:

    gr.Markdown(
    '''
    # <center>Modelo de Clasificación NPS</center>
    Este es un modelo para categorizar reclamos de NPS, prueba escribiendo reclamos abajo!
    ''')

    with gr.Column() as text_col:
        with gr.Row():
            text_input = gr.Textbox(placeholder = "Ingresa el reclamo acá", label = 'Reclamo')

        # gr.Label replaces the deprecated gr.outputs.Label alias.
        with gr.Row():
            macro_output = gr.Label(label = 'Categorías Generales')

        # One hidden row per general category; predict() reveals the rows that
        # match the predicted label.
        with gr.Row():
            with gr.Row(visible = False) as row_cliente:
                cliente_component = gr.Label(label = 'Categorías Atención al Cliente')

            with gr.Row(visible = False) as row_conforme:
                conforme_component = gr.Label(label = 'Categorías Conforme')

            with gr.Row(visible = False) as row_devoluciones:
                devoluciones_component = gr.Label(label = 'Categorías Devoluciones')

            with gr.Row(visible = False) as row_entrega:
                entrega_component = gr.Label(label = 'Categorías Entrega')

            with gr.Row(visible = False) as row_financiamiento:
                financiamiento_component = gr.Label(label = 'Categorías Financiamiento')

            with gr.Row(visible = False) as row_otros:
                otros_component = gr.Label(label = 'Categorías Otros')

            with gr.Row(visible = False) as row_stock:
                stock_component = gr.Label(label = 'Categorías Stock')

            with gr.Row(visible = False) as row_marketing:
                marketing_component = gr.Label(label = 'Categorías Trade Marketing')

            with gr.Row(visible = False) as row_ventas:
                ventas_component = gr.Label(label = 'Categorías Ventas')

    # Every component predict() may update; predict() returns a dict keyed by
    # these components.
    outputs = [
        macro_output, cliente_component, conforme_component, devoluciones_component,
        entrega_component, financiamiento_component, otros_component, stock_component,
        marketing_component, ventas_component, row_cliente, row_conforme,
        row_devoluciones, row_entrega, row_financiamiento, row_otros,
        row_stock, row_marketing, row_ventas, ]

    button = gr.Button('Submit')
    button.click(fn = predict, inputs = text_input, outputs = outputs)

    gr.Examples(
        examples = [['sale mas a cuenta comprar en los supermercados que a la cervecería'],
                    ['llega las latas abolladas sucias'],
                    ['vendedor no viene presencialmente solo por whatsapp'],
                    ['mejorar la atención de los repartidores porque roban'],
                    ['seria bueno mas promociones y publicidad']],
        inputs = text_input)

demo.launch(share = True)
|
production_models/cliente/cliente_model.pkl
ADDED
Binary file (48.1 kB). View file
|
|
production_models/cliente/lstm_atencionalcliente.pt
ADDED
Binary file (701 kB). View file
|
|
production_models/conforme/conforme_model.pkl
ADDED
Binary file (16.9 kB). View file
|
|
production_models/conforme/lstm_conforme.pt
ADDED
Binary file (329 kB). View file
|
|
production_models/devoluciones/devoluciones_model.pkl
ADDED
Binary file (10.4 kB). View file
|
|
production_models/devoluciones/lstm_devoluciones.pt
ADDED
Binary file (248 kB). View file
|
|
production_models/entrega/entrega_model.pkl
ADDED
Binary file (30.4 kB). View file
|
|
production_models/entrega/lstm_entrega.pt
ADDED
Binary file (491 kB). View file
|
|
production_models/financiamiento/financiamiento_model.pkl
ADDED
Binary file (12.4 kB). View file
|
|
production_models/financiamiento/lstm_financiamiento.pt
ADDED
Binary file (275 kB). View file
|
|
production_models/marketing/lstm_trademarketing.pt
ADDED
Binary file (695 kB). View file
|
|
production_models/marketing/marketing_model.pkl
ADDED
Binary file (47.9 kB). View file
|
|
production_models/otros/lstm_otros.pt
ADDED
Binary file (406 kB). View file
|
|
production_models/otros/otros_model.pkl
ADDED
Binary file (23.3 kB). View file
|
|
production_models/stock/lstm_stock.pt
ADDED
Binary file (364 kB). View file
|
|
production_models/stock/stock_model.pkl
ADDED
Binary file (19.8 kB). View file
|
|
production_models/ventas/lstm_ventas.pt
ADDED
Binary file (473 kB). View file
|
|
production_models/ventas/ventas_model.pkl
ADDED
Binary file (29 kB). View file
|
|
utils/load_model.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pickle
|
2 |
+
from glob import glob
|
3 |
+
import torch
|
4 |
+
import os
|
5 |
+
|
6 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
7 |
+
|
8 |
+
def load_model(folder, base_folder = 'production_models'):

    '''
    Load a prediction model from disk.

    Looks inside <base_folder>/<folder> for a serialized torch model (*.pt)
    and a pickled wrapper object (*.pkl), unpickles the wrapper, attaches the
    torch model to it as `.model` and returns the wrapper.

    folder: sub-folder to load the model from (str)
    base_folder: directory that contains the model sub-folders (str)

    Raises FileNotFoundError when the folder holds no *.pt or no *.pkl file.
    '''

    model_dir = os.path.join(base_folder, folder)

    def _first_match(pattern):
        # Sorted so the choice is deterministic if several files match.
        matches = sorted(glob(os.path.join(model_dir, pattern)))
        if not matches:
            raise FileNotFoundError(f"No file matching '{pattern}' found in '{model_dir}'")
        return matches[0]

    model_path = _first_match('*.pt')
    clf_path = _first_match('*.pkl')

    # NOTE: pickle.load and torch.load can execute arbitrary code while
    # deserializing; only point this function at trusted model files.
    with open(clf_path, 'rb') as file:
        clf = pickle.load(file)

    # map_location moves the stored tensors to the device selected at import time.
    clf.model = torch.load(model_path, map_location = device)

    return clf
|
utils/lstm.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
|
5 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
6 |
+
|
7 |
+
class LSTM(nn.Module):
    '''
    Text classifier: embedding layer -> (optionally bidirectional) LSTM ->
    dropout -> linear layer producing one score per class.
    '''

    def __init__(self, vocab_size, n_classes, hidden_dim, embedding_dim, n_layers, dropout, bidirectional = True):
        '''
        vocab_size: number of rows of the embedding table (int)
        n_classes: number of output scores (int)
        hidden_dim: LSTM hidden size per direction (int)
        embedding_dim: size of each embedding vector (int)
        n_layers: number of stacked LSTM layers (int)
        dropout: dropout probability, used between LSTM layers and before the linear layer (float)
        bidirectional: whether the LSTM runs in both directions (bool)
        '''
        super().__init__()

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # Embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim, device = device)
        self.lstm = nn.LSTM(
            input_size = embedding_dim,
            hidden_size = hidden_dim,
            num_layers = n_layers,
            dropout = dropout,
            batch_first = True,
            bidirectional = bidirectional,
            device = device,
        )

        # Dropout
        self.dropout = nn.Dropout(dropout)

        # Linear layer; a bidirectional LSTM emits both directions concatenated.
        n_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * n_directions, n_classes, device = device)

    def forward(self, x):
        '''
        x: batch of token-id sequences -- assumes shape (batch, seq_len), TODO confirm

        Returns (output, hidden): the linear-layer scores and the state
        returned by the LSTM layer.
        '''
        embedded = self.embedding(x)
        sequence_out, hidden = self.lstm(embedded)

        # batch_first = True, so axis 1 is time: keep only the last time step.
        last_step = sequence_out[:, -1, :]

        output = self.fc(self.dropout(last_step))

        return output, hidden
|
utils/production_model
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn.functional as F
|
3 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
4 |
+
|
5 |
+
class ProductionModel():
    '''
    Inference wrapper bundling a tokenizer, a label dictionary and a torch
    model. The model is not set by the constructor; it must be assigned to
    `.model` before calling predict.
    '''

    def __init__(self, tokenizer, dict_labels):
        '''
        tokenizer: object exposing tokenize(list_of_texts)
        dict_labels: mapping from output position to label name
        '''
        self.model = None
        self.dict_labels = dict_labels
        self.tokenizer = tokenizer

    def predict(self, X):

        '''
        Generate predictions for new data (X).

        X: list with the data, each element is one observation (list)

        Returns one {label: probability} dict per observation; when X holds a
        single observation the dict itself is returned instead of a list.
        Raises ValueError if no model has been attached.
        '''

        if self.model is None:
            raise ValueError('Debes cargar el modelo con self.model = torch.load(model_file.pt)')

        token_ids = torch.tensor(self.tokenizer.tokenize(X), device = device)

        self.model.eval()
        with torch.no_grad():
            # The model returns a tuple; its first element holds the class scores.
            scores = self.model(token_ids)[0]
            probabilities = F.softmax(scores, dim = 1).to('cpu').detach().numpy()

        results = []
        for row in probabilities:
            results.append({self.dict_labels[position]: float(proba) for position, proba in enumerate(row)})

        return results[0] if len(results) == 1 else results
|
utils/tokenizer.py
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
from collections import Counter
|
4 |
+
from string import punctuation
|
5 |
+
|
6 |
+
class Tokenizer():
    '''
    Word-level tokenizer.

    `train` builds a vocabulary ranked by frequency; `tokenize` turns raw
    texts into fixed-length rows of integer ids. Id 0 is reserved for padding
    ('<pad>') and id 1 for unknown words ('<unk>'); real words start at 2.
    `vector_size` must be set on the instance before calling `tokenize`.
    '''

    # Translation table that deletes every ASCII punctuation character.
    _PUNCTUATION_TABLE = str.maketrans('', '', punctuation)

    def __init__(self):

        self.vocab = None
        self.vocab_to_int = None
        self.int_to_vocab = None
        self.counts = None
        self.vector_size = None
        self.pad_idx = 0
        self.unk_idx = 1

    def preprocessing(self, texts: list):

        '''
        Pre-process documents: lowercase them, delete punctuation and replace
        "\\n" characters with spaces. Returns the pre-processed texts.

        texts: documents to be processed (list)
        '''

        table = self._PUNCTUATION_TABLE

        return [text.lower().translate(table).replace('\n', ' ') for text in texts]

    def train(self, texts: list):

        '''
        Train the tokenizer, building the token vocabulary and its encoding.

        texts: documents the tokenizer uses to build the vocabulary (list)
        '''

        # preprocessing
        texts = self.preprocessing(texts)

        # join every document and split into words
        words = ' '.join(texts).split()

        self.counts = Counter(words) # word -> frequency
        self.vocab = sorted(self.counts, key = self.counts.get, reverse = True) # words ordered by frequency
        self.vocab_to_int = {word: ii for ii, word in enumerate(self.vocab, 2)} # word -> id, ids start at 2

        # Special tokens map token -> id like every other entry, so the
        # inverted dictionary below resolves ids 0 and 1 as well.
        self.vocab_to_int['<unk>'] = self.unk_idx # token for unknown words
        self.vocab_to_int['<pad>'] = self.pad_idx # token for padding

        self.int_to_vocab = {value: key for key, value in self.vocab_to_int.items()}

    def encode(self, texts: list):

        '''
        Encode texts with the trained vocabulary. Each text is split on
        whitespace; words missing from the vocabulary get the unknown id.
        Returns the encoded texts.

        texts: documents to be encoded (list)
        '''

        # getattr: instances created before this attribute was initialised in
        # __init__ (e.g. unpickled ones) may not have it at all.
        vocab_to_int = getattr(self, 'vocab_to_int', None)

        if vocab_to_int is None:
            raise ValueError('Debes entrenar el tokenizador primero')

        unk_idx = self.unk_idx

        return [[vocab_to_int.get(word, unk_idx) for word in text.split()] for text in texts]

    def decode(self, texts: list):

        '''
        Decode texts with the trained vocabulary. Ids missing from the
        vocabulary are decoded as 'unk'. Returns the decoded texts.

        texts: documents to be decoded (list)
        '''

        if self.vocab is None:
            raise ValueError('Debes entrenar el tokenizador primero')

        int_to_vocab = self.int_to_vocab

        return [[int_to_vocab.get(word, 'unk') for word in text] for text in texts]

    def filter_text(self, encoded_text: list, encoded_labels: list, min_tokens = 1, max_tokens = 1e6):

        '''
        Filter a collection of documents by number of tokens. Prints the
        number of documents before and after, and returns the filtered
        documents together with their labels (as a numpy array).

        encoded_text: encoded texts to be filtered (list)
        encoded_labels: labels to filter alongside the texts (list)
        min_tokens: minimum number of tokens allowed (int)
        max_tokens: maximum number of tokens allowed (int)
        '''

        print('Documentos antes de eliminación:', len(encoded_text))

        # indices of the documents whose length satisfies both bounds
        filter_idx = [ii for ii, text in enumerate(encoded_text) if min_tokens <= len(text) <= max_tokens]

        # keep only those documents ...
        encoded_text = [encoded_text[ii] for ii in filter_idx]

        # ... and their labels
        encoded_labels = np.array([encoded_labels[ii] for ii in filter_idx])

        print('Documentos después de eliminación:', len(encoded_text))

        return encoded_text, encoded_labels

    def padding(self, encoded_text: list, vector_size: int):

        '''
        Force every sequence to length vector_size:
        sequences longer than vector_size are cut on the right;
        sequences shorter than vector_size are filled with 0s on the left;
        empty sequences become all 0s.
        Returns the result as an integer numpy array.

        encoded_text: list with the texts to modify (list)
        vector_size: length to give every document (int)
        '''

        features = np.zeros((len(encoded_text), vector_size), dtype = int)

        for i, row in enumerate(encoded_text):
            kept = row[:vector_size]

            # Skip empty rows: a slice starting at -0 would select the whole
            # row and the assignment would fail.
            if len(kept) == 0:
                continue

            features[i, -len(kept):] = kept

        return features

    def tokenize(self, texts: list):

        '''
        Tokenize documents with the trained vocabulary: pre-process, encode
        and pad. Returns the documents encoded with length self.vector_size.

        texts: texts to be tokenized (list)
        '''

        if self.vocab is None:
            raise ValueError('Debes entrenar el tokenizador primero')

        # getattr: vector_size is assigned by the user after construction and
        # may be missing on instances created before it was initialised.
        vector_size = getattr(self, 'vector_size', None)

        if vector_size is None:
            raise ValueError('Debes especificar vector_size en objeto Tokenizer (tokenizer.vector_size = x)')

        # preprocessing
        texts = self.preprocessing(texts)

        # encode
        encoded_text = self.encode(texts)

        # padding
        return self.padding(encoded_text, vector_size)

    def graph_distribution(self, encoded_text):

        '''
        Plot the distribution of document lengths (in tokens).

        encoded_text: encoded documents (list)
        '''

        if self.vocab is None:
            raise ValueError('Debes entrenar el tokenizador primero')

        text_lens = Counter([len(text) for text in encoded_text]) # how many documents have each length

        plt.figure(figsize = (12, 6))
        plt.bar(text_lens.keys(), text_lens.values())
        plt.title('Distribución del largo de documentos en el Dataset')
        plt.xlabel('Cantidad de tokens')
        plt.ylabel('Frecuencia')
        plt.show()

    def __len__(self):
        # Vocabulary size including the two special tokens; 0 before training.
        return len(self.vocab_to_int) if self.vocab is not None else 0
|