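"""Streamlit demo that classifies polyclinic reviews with three pipelines:
Bag-of-Words + LogReg, a word-embedding LSTM, and multilingual BERT features + LogReg."""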
import streamlit as st
import numpy as np
import time
import pickle
import torch
import pandas as pd
from gensim.models import KeyedVectors
from transformers import BertTokenizer, BertModel
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from function.lstm_preprocessing import (
    clean,
    tokin,
    predict_ml_class,
    predict_sentence,
    predict_single_string,
    LSTMClassifier,
)
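# Inference device and Russian text-preprocessing resources (stemmer, stopword list)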
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
stemmer = SnowballStemmer('russian')
sw = stopwords.words('russian')
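# LSTM hyperparameters and the vocabulary size used for the embedding matrix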
EMBEDDING_DIM = 32
HIDDEN_DIM = 32
SEQ_LEN = 200
VOCAB_SIZE = 196906
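# Load pretrained gensim word vectors and the word -> index mapping,
# then build an embedding matrix aligned with vocab_to_int (OOV rows stay zero)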
wv = KeyedVectors.load("file/wv.wordvectors", mmap='r')
with open('file/vocab_to_int.txt', 'rb') as f:
    vocab_to_int = pickle.load(f)
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in vocab_to_int.items():
    try:
        embedding_matrix[i] = wv[word]
    except KeyError:
        # Words missing from the pretrained vectors keep a zero embedding
        pass
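# Frozen pretrained embeddings feed the LSTM classifier; weights are restored from a checkpoint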
embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))
model = LSTMClassifier(embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_DIM, embedding=embedding_layer).to(DEVICE)
model.load_state_dict(torch.load('models/LTSM_model_epoch_7.pt', map_location='cpu'))
model.eval()  # switch to evaluation mode for inference
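# Multilingual BERT tokenizer and encoder for the BERT-based pipeline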
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model_BERT = BertModel.from_pretrained("bert-base-multilingual-cased")
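# Pickled artifacts: the LogReg head applied to BERT features, plus the
# vectorizer and classifier for the Bag-of-Words pipeline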
with open('models/LogReg.pickle', 'rb') as f:
    loaded_model = pickle.load(f)
with open('models/trained_model.pkl', 'rb') as f:
    loaded_classifier = pickle.load(f)
with open('models/vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)
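# Page title and a text area where the user enters the review to classify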
def main():
    st.title("Классификация отзыва на поликлиники")
    user_input = st.text_area("Введите ваш отзыв:", "")
    return user_input


user_input = main()
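# Each wrapper returns the predicted class and the wall-clock inference time in seconds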
def predict_lstm(user_input):
    start_time = time.time()
    prediction = predict_sentence(user_input, model, SEQ_LEN, vocab_to_int)
    end_time = time.time()
    return prediction, round((end_time - start_time), 4)
def predict_bert(user_input):
    start_time = time.time()
    prediction = predict_single_string(user_input, model_BERT, loaded_model)
    end_time = time.time()
    return prediction, round((end_time - start_time), 4)
def predict_ML(user_input):
    start_time = time.time()
    prediction = predict_ml_class(user_input, loaded_vectorizer, loaded_classifier)
    end_time = time.time()
    return prediction, round((end_time - start_time), 4)
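# Run all three models on the submitted review and report each prediction with its latency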
if user_input:
    prediction_ml, time_taken_ml = predict_ML(user_input)
    st.write("### Bag-of-Words + LogReg")
    st.write("Предсказанный класс:", prediction_ml)
    st.write("Время предсказания:", time_taken_ml, "сек.")

    prediction_lstm, time_taken_lstm = predict_lstm(user_input)
    st.write("### LSTM модель")
    st.write("Предсказанный класс:", prediction_lstm)
    st.write("Время предсказания:", time_taken_lstm, "сек.")

    prediction_bert, time_taken_bert = predict_bert(user_input)
    st.write("### BERT модель + LogReg")
    st.write("Предсказанный класс:", prediction_bert)
    st.write("Время предсказания:", time_taken_bert, "сек.")
st.sidebar.image('images/polyclinic.jpeg', use_column_width=True)
f1_score_classic_ml = 0.87
f1_score_rnn = 0.88
f1_score_bert = 0.83
f1_score_classic_ml_valid = 0.89
f1_score_rnn_valid = 0.92
f1_score_bert_valid = 0.82
# Build a DataFrame comparing the models' results
st.sidebar.write("### Сравнительная таблица по метрике f1-macro")
results = {
    "Модель": ["Классический ML", "LSTM", "BERT-based"],
    "train": [f1_score_classic_ml, f1_score_rnn, f1_score_bert],
    "valid": [f1_score_classic_ml_valid, f1_score_rnn_valid, f1_score_bert_valid],
}
results_df = pd.DataFrame(results)
st.sidebar.dataframe(results_df)
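# Local launch (assuming this file is saved as app.py): streamlit run app.py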