Spaces:
Runtime error
Runtime error
Anton
committed on
Commit
•
d0a9720
1
Parent(s):
c48c4b6
- pages/mayakovsky.py +64 -0
- pages/polyclinics.py +115 -0
- pages/pushkin.py +64 -0
- pages/toxic.py +36 -0
pages/mayakovsky.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
import textwrap
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# All inference runs on CPU; the fine-tuned weights are mapped here on load.
DEVICE = torch.device("cpu")

# Load the Russian GPT-2 tokenizer and base model.
tokenizer = GPT2Tokenizer.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2')
model_finetuned = GPT2LMHeadModel.from_pretrained(
    'sberbank-ai/rugpt3small_based_on_gpt2',
    output_attentions=False,
    output_hidden_states=False,
)
# map_location=DEVICE works whether or not CUDA is available, so the previous
# cuda/cpu if/else branching is unnecessary: load_state_dict copies the
# tensors into the (CPU-resident) model parameters either way.
model_finetuned.load_state_dict(torch.load('models/mayakovsky.pt', map_location=DEVICE))
model_finetuned.eval()
|
19 |
+
|
20 |
+
# Function to generate text
def generate_text(prompt, temperature, top_p, max_length, top_k, num_return_sequences=1):
    """Generate continuations of *prompt* with the fine-tuned GPT-2 model.

    Args:
        prompt: Seed text to continue.
        temperature: Softmax temperature (higher = more random output).
        top_p: Nucleus-sampling probability mass.
        max_length: Maximum generated sequence length in tokens.
        top_k: Top-k sampling cutoff.
        num_return_sequences: Number of independent continuations to return
            (generalized from the previously hard-coded 1; default preserves
            the old behavior for existing callers).

    Returns:
        A list of decoded strings, one per returned sequence.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # No gradients are needed for sampling.
    with torch.no_grad():
        out = model_finetuned.generate(
            input_ids,
            do_sample=True,
            num_beams=5,
            temperature=temperature,
            top_p=top_p,
            max_length=max_length,
            top_k=top_k,
            no_repeat_ngram_size=3,
            num_return_sequences=num_return_sequences,
        )

    return [tokenizer.decode(ids) for ids in out]
|
39 |
+
|
40 |
+
# Streamlit app
def main():
    """Render the UI and generate Mayakovsky-style text on demand."""
    st.title("Генерация текста GPT-моделью в стиле В.В. Маяковского")

    # Sampling controls supplied by the user.
    prompt = st.text_area("Введите начало текста")
    temperature = st.slider("Temperature", min_value=0.2, max_value=2.5, value=1.8, step=0.1)
    top_p = st.slider("Top-p", min_value=0.1, max_value=1.0, value=0.9, step=0.1)
    max_length = st.slider("Max Length", min_value=10, max_value=300, value=100, step=10)
    top_k = st.slider("Top-k", min_value=1, max_value=500, value=500, step=10)
    num_return_sequences = st.slider("Number of Sequences", min_value=1, max_value=5, value=1, step=1)

    if st.button("Generate Text"):
        st.subheader("Generated Text:")
        # One independent generation pass per requested sequence.
        for seq_idx in range(num_return_sequences):
            texts = generate_text(prompt, temperature, top_p, max_length, top_k)
            st.write(f"Generated Text {seq_idx + 1}:")
            st.write(textwrap.fill(texts[0], width=80))
            st.write("------------------")

    st.sidebar.image('images/mayakovsky.jpeg', use_column_width=True)


if __name__ == "__main__":
    main()
|
pages/polyclinics.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import numpy as np
|
3 |
+
import time
|
4 |
+
import pickle
|
5 |
+
import torch
|
6 |
+
import pandas as pd
|
7 |
+
from gensim.models import KeyedVectors
|
8 |
+
from transformers import BertTokenizer, BertModel
|
9 |
+
from nltk.corpus import stopwords
|
10 |
+
from nltk.stem import SnowballStemmer
|
11 |
+
from function.lstm_preprocessing import (
|
12 |
+
clean,
|
13 |
+
tokin,
|
14 |
+
predict_ml_class,
|
15 |
+
predict_sentence,
|
16 |
+
predict_single_string,
|
17 |
+
LSTMClassifier
|
18 |
+
)
|
19 |
+
|
20 |
+
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

stemmer = SnowballStemmer('russian')
sw = stopwords.words('russian')

# Model hyperparameters — must match the training configuration.
# (EMBEDDING_DIM was previously defined twice; the duplicate is removed.)
EMBEDDING_DIM = 32
HIDDEN_DIM = 32
SEQ_LEN = 200
VOCAB_SIZE = 196906

# Pre-trained word vectors used to seed the LSTM embedding layer.
wv = KeyedVectors.load("file/wv.wordvectors", mmap='r')

with open('file/vocab_to_int.txt', 'rb') as f:
    vocab_to_int = pickle.load(f)

# Build the embedding matrix; words absent from the word-vector vocabulary
# keep their zero-vector initialization.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in vocab_to_int.items():
    try:
        embedding_matrix[i] = wv[word]
    except KeyError:
        pass

embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))

model = LSTMClassifier(embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_DIM, embedding=embedding_layer).to(DEVICE)
model.load_state_dict(torch.load('models/LTSM_model_epoch_7.pt', map_location='cpu'))

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model_BERT = BertModel.from_pretrained("bert-base-multilingual-cased")

# Context managers close the pickle files deterministically — the original
# pickle.load(open(...)) calls leaked open file handles.
with open('models/LogReg.pickle', 'rb') as f:
    loaded_model = pickle.load(f)

with open('models/trained_model.pkl', 'rb') as f:
    loaded_classifier = pickle.load(f)
with open('models/vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)
|
56 |
+
|
57 |
+
def main():
    """Render the page header and collect the review text from the user."""
    st.title("Классификация отзыва на поликлиники")
    review_text = st.text_area("Введите ваш отзыв:", "")
    return review_text


user_input = main()
|
63 |
+
|
64 |
+
def predict_lstm(user_input):
    """Classify the review with the LSTM model.

    Returns a (prediction, elapsed_seconds) tuple, seconds rounded to 4 places.
    """
    t0 = time.time()
    label = predict_sentence(user_input, model, SEQ_LEN, vocab_to_int)
    return label, round(time.time() - t0, 4)
|
69 |
+
|
70 |
+
def predict_bert(user_input):
    """Classify the review with BERT embeddings + logistic regression.

    Returns a (prediction, elapsed_seconds) tuple, seconds rounded to 4 places.
    """
    t0 = time.time()
    label = predict_single_string(user_input, model_BERT, loaded_model)
    return label, round(time.time() - t0, 4)
|
75 |
+
|
76 |
+
def predict_ML(user_input):
    """Classify the review with the classical bag-of-words ML pipeline.

    Returns a (prediction, elapsed_seconds) tuple, seconds rounded to 4 places.
    """
    t0 = time.time()
    label = predict_ml_class(user_input, loaded_vectorizer, loaded_classifier)
    return label, round(time.time() - t0, 4)
|
81 |
+
|
82 |
+
def _report(section_title, predict_fn, text):
    """Write one model's section: heading, predicted class, and latency.

    Factored out of three copy-pasted blocks that also reused the misleading
    variable name `prediction_rnn` for the ML and BERT results.
    """
    prediction, elapsed = predict_fn(text)
    st.write(section_title)
    st.write("Предсказанный класс:", prediction)
    st.write("Время предсказания:", elapsed, "сек.")


if user_input:
    _report("### Bag-of-Words + LogReg", predict_ML, user_input)
    _report("### LSTM модель", predict_lstm, user_input)
    _report("### BERT модель + LogReg", predict_bert, user_input)


st.sidebar.image('images/polyclinic.jpeg', use_column_width=True)

# f1-macro scores — presumably measured offline on train/validation splits;
# TODO confirm against the training notebook.
f1_score_classic_ml = 0.87
f1_score_rnn = 0.88
f1_score_bert = 0.83
f1_score_classic_ml_valid = 0.89
f1_score_rnn_valid = 0.92
f1_score_bert_valid = 0.82

# Comparison table of the three approaches.
st.sidebar.write("### Сравнительная таблица по метрике f1-macro")
results = {
    "Модель": ["Классический ML", "LSTM", "BERT-based"],
    "train": [f1_score_classic_ml, f1_score_rnn, f1_score_bert],
    "valid": [f1_score_classic_ml_valid, f1_score_rnn_valid, f1_score_bert_valid]
}
results_df = pd.DataFrame(results)
st.sidebar.dataframe(results_df)
|
pages/pushkin.py
ADDED
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
import textwrap
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# All inference runs on CPU; the fine-tuned weights are mapped here on load.
DEVICE = torch.device("cpu")

# Load the Russian GPT-2 tokenizer and base model.
tokenizer = GPT2Tokenizer.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2')
model_finetuned = GPT2LMHeadModel.from_pretrained(
    'sberbank-ai/rugpt3small_based_on_gpt2',
    output_attentions=False,
    output_hidden_states=False,
)
# map_location=DEVICE works whether or not CUDA is available, so the previous
# cuda/cpu if/else branching is unnecessary: load_state_dict copies the
# tensors into the (CPU-resident) model parameters either way.
model_finetuned.load_state_dict(torch.load('models/model_pushkin.pt', map_location=DEVICE))
model_finetuned.eval()
|
19 |
+
|
20 |
+
# Function to generate text
def generate_text(prompt, temperature, top_p, max_length, top_k, num_return_sequences=1):
    """Generate continuations of *prompt* with the fine-tuned GPT-2 model.

    Args:
        prompt: Seed text to continue.
        temperature: Softmax temperature (higher = more random output).
        top_p: Nucleus-sampling probability mass.
        max_length: Maximum generated sequence length in tokens.
        top_k: Top-k sampling cutoff.
        num_return_sequences: Number of independent continuations to return
            (generalized from the previously hard-coded 1; default preserves
            the old behavior for existing callers).

    Returns:
        A list of decoded strings, one per returned sequence.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # No gradients are needed for sampling.
    with torch.no_grad():
        out = model_finetuned.generate(
            input_ids,
            do_sample=True,
            num_beams=5,
            temperature=temperature,
            top_p=top_p,
            max_length=max_length,
            top_k=top_k,
            no_repeat_ngram_size=3,
            num_return_sequences=num_return_sequences,
        )

    return [tokenizer.decode(ids) for ids in out]
|
39 |
+
|
40 |
+
# Streamlit app
def main():
    """Render the UI and generate Pushkin-style text on demand."""
    st.title("Генерация текста GPT-моделью в стиле А.С. Пушкина")

    # Sampling controls supplied by the user.
    prompt = st.text_area("Введите начало текста")
    temperature = st.slider("Temperature", min_value=0.2, max_value=2.5, value=1.8, step=0.1)
    top_p = st.slider("Top-p", min_value=0.1, max_value=1.0, value=0.9, step=0.1)
    max_length = st.slider("Max Length", min_value=10, max_value=300, value=100, step=10)
    top_k = st.slider("Top-k", min_value=1, max_value=500, value=500, step=10)
    num_return_sequences = st.slider("Number of Sequences", min_value=1, max_value=5, value=1, step=1)

    if st.button("Generate Text"):
        st.subheader("Generated Text:")
        # One independent generation pass per requested sequence.
        for seq_idx in range(num_return_sequences):
            texts = generate_text(prompt, temperature, top_p, max_length, top_k)
            st.write(f"Generated Text {seq_idx + 1}:")
            st.write(textwrap.fill(texts[0], width=80))
            st.write("------------------")

    st.sidebar.image('images/pushkin.jpeg', use_column_width=True)


if __name__ == "__main__":
    main()
|
pages/toxic.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import torch
|
3 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
4 |
+
|
5 |
+
def main():
    """Streamlit page that rates how toxic a user-submitted message is."""
    st.title("Оценка токсичности сообщений")

    # Load the pretrained toxicity classifier.
    # NOTE(review): this runs again on every Streamlit rerun; st.cache_resource
    # would avoid reloading the checkpoint — confirm before changing.
    model_checkpoint = 'cointegrated/rubert-tiny-toxicity'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
    if torch.cuda.is_available():
        model.cuda()

    def text2toxicity(text, aggregate=True):
        """Calculate toxicity of a text (if aggregate=True) or a vector of toxicity aspects (if aggregate=False)"""
        with torch.no_grad():
            batch = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
            proba = torch.sigmoid(model(**batch).logits).cpu().numpy()
        if isinstance(text, str):
            proba = proba[0]
        if not aggregate:
            return proba
        # Fold the per-aspect probabilities into a single scalar score.
        return 1 - proba.T[0] * (1 - proba.T[-1])

    message = st.text_area("Введите сообщение для оценки:")
    if st.button("Оценить") and message:
        st.write(f"Степень токсичности: {text2toxicity(message):.4f}")

    st.sidebar.image('images/toxic.jpeg', use_column_width=True)


if __name__ == "__main__":
    main()
|