# Teachbot/models/model.py
import json
import random
import string

import nltk
import numpy as np
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
"""
nltk.download('omw-1.4')
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)
"""


class ModeleDeepLearning:
    """Bag-of-words intent classifier built on a small Keras feed-forward net."""

    def __init__(self, file_path, epochs=200):
        self.file_path = file_path
        self.epochs = epochs
        self.model = None
        self.words = []    # sorted, lemmatized vocabulary
        self.classes = []  # sorted intent tags
        self.lemmatizer = WordNetLemmatizer()
        self.run()

    def importer(self):
        """Load the intents JSON file."""
        with open(self.file_path, encoding="utf-8") as f:
            data = json.load(f)
        return data
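
    # A minimal sketch of the intents file this loader expects. The field
    # names ("intents", "tag", "patterns", "responses") are the ones read
    # elsewhere in this class; the example values are illustrative only:
    #
    #   {
    #     "intents": [
    #       {
    #         "tag": "greeting",
    #         "patterns": ["Hi", "Hello there"],
    #         "responses": ["Hello!", "How can I help?"]
    #       }
    #     ]
    #   }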

    def preprocess_data(self):
        """Build bag-of-words vectors and one-hot intent labels for training."""
        doc_X = []
        doc_y = []
        for intent in self.data["intents"]:
            for pattern in intent["patterns"]:
                tokens = word_tokenize(pattern)
                self.words.extend(tokens)
                doc_X.append(pattern)
                doc_y.append(intent["tag"])
            if intent["tag"] not in self.classes:
                self.classes.append(intent["tag"])
        self.words = sorted(set(
            self.lemmatizer.lemmatize(word.lower())
            for word in self.words if word not in string.punctuation
        ))
        self.classes = sorted(set(self.classes))
        training = []
        out_empty = [0] * len(self.classes)
        for idx, doc in enumerate(doc_X):
            # Lemmatize the pattern token by token so its words line up with
            # the vocabulary built above; a substring test on the raw string
            # would mark spurious matches.
            tokens = [self.lemmatizer.lemmatize(t) for t in word_tokenize(doc.lower())]
            bow = [1 if word in tokens else 0 for word in self.words]
            output_row = list(out_empty)
            output_row[self.classes.index(doc_y[idx])] = 1
            training.append([bow, output_row])
        random.shuffle(training)
        training = np.array(training, dtype=object)
        train_X = np.array(list(training[:, 0]))
        train_y = np.array(list(training[:, 1]))
        return train_X, train_y
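
    # Shape sketch for preprocess_data (illustrative numbers): with a
    # 50-token vocabulary and 3 intent tags, train_X is (num_patterns, 50)
    # and train_y is a one-hot (num_patterns, 3).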

    def build_model(self, input_shape, output_shape):
        """Small feed-forward classifier: bag-of-words in, intent scores out."""
        model = Sequential()
        model.add(Dense(128, input_shape=input_shape, activation="relu"))
        model.add(Dropout(0.5))
        model.add(Dense(64, activation="relu"))
        model.add(Dropout(0.3))
        model.add(Dense(output_shape, activation="softmax"))
        adam = tf.keras.optimizers.Adam(learning_rate=0.01)
        model.compile(loss="categorical_crossentropy", optimizer=adam, metrics=["accuracy"])
        return model

    def train_model(self):
        input_shape = (len(self.train_X[0]),)
        output_shape = len(self.train_y[0])
        self.model = self.build_model(input_shape, output_shape)
        self.model.fit(x=self.train_X, y=self.train_y, epochs=self.epochs, verbose=1)

    def clean_text(self, text):
        tokens = word_tokenize(text)
        # Lowercase before lemmatizing so query tokens match the lowercased
        # training vocabulary.
        tokens = [self.lemmatizer.lemmatize(word.lower()) for word in tokens]
        return tokens

    def bag_of_words(self, text):
        tokens = self.clean_text(text)
        bow = [0] * len(self.words)
        for w in tokens:
            for idx, word in enumerate(self.words):
                if word == w:
                    bow[idx] = 1
        return np.array(bow)

    def predict_class(self, text):
        bow = self.bag_of_words(text)
        result = self.model.predict(np.array([bow]))[0]
        thresh = 0.2
        y_pred = [[idx, res] for idx, res in enumerate(result) if res > thresh]
        y_pred.sort(key=lambda x: x[1], reverse=True)
        return [self.classes[r[0]] for r in y_pred]

    def get_response(self, intents_list):
        # Guard against an empty prediction list (nothing above the threshold).
        if not intents_list:
            return None
        tag = intents_list[0]
        result = None
        for intent in self.data["intents"]:
            if intent["tag"] == tag:
                result = random.choice(intent["responses"])
                break
        return result

    def predict(self, question):
        intents = self.predict_class(question)
        return self.get_response(intents)

    def run(self):
        self.data = self.importer()
        self.train_X, self.train_y = self.preprocess_data()
        self.train_model()
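

# Minimal usage sketch. The intents path and the sample question below are
# assumptions for illustration; they are not defined by this module.
if __name__ == "__main__":
    bot = ModeleDeepLearning("intents.json", epochs=200)  # trains on creation
    print(bot.predict("Hello"))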