Anton committed on
Commit
d0a9720
1 Parent(s): c48c4b6
Files changed (4) hide show
  1. pages/mayakovsky.py +64 -0
  2. pages/polyclinics.py +115 -0
  3. pages/pushkin.py +64 -0
  4. pages/toxic.py +36 -0
pages/mayakovsky.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import textwrap
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# All inference on this page runs on CPU; DEVICE is also used as the
# map_location so that checkpoint loading is safe on CPU-only hosts.
DEVICE = torch.device("cpu")

# Load the Russian GPT-2 tokenizer and base model, then overwrite the
# base weights with the Mayakovsky-finetuned checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2')
model_finetuned = GPT2LMHeadModel.from_pretrained(
    'sberbank-ai/rugpt3small_based_on_gpt2',
    output_attentions=False,
    output_hidden_states=False,
)
# map_location works on both CUDA and CPU-only machines, so no branching
# on torch.cuda.is_available() is needed: the model's parameters live on
# CPU (DEVICE) in either case, and load_state_dict copies into them.
model_finetuned.load_state_dict(torch.load('models/mayakovsky.pt', map_location=DEVICE))
model_finetuned.eval()
19
+
20
# Function to generate text
def generate_text(prompt, temperature, top_p, max_length, top_k, num_return_sequences=1):
    """Generate continuations of *prompt* with the finetuned GPT-2 model.

    Args:
        prompt: Seed text to continue.
        temperature: Sampling temperature (higher = more random).
        top_p: Nucleus-sampling probability mass.
        max_length: Maximum length of the generated sequence in tokens.
        top_k: Top-k sampling cutoff.
        num_return_sequences: Number of independent samples to generate
            (default 1, matching the original behaviour).

    Returns:
        A list of decoded strings, one per generated sequence.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Inference only — disable autograd bookkeeping.
    with torch.no_grad():
        out = model_finetuned.generate(
            input_ids,
            do_sample=True,
            num_beams=5,
            temperature=temperature,
            top_p=top_p,
            max_length=max_length,
            top_k=top_k,
            no_repeat_ngram_size=3,
            num_return_sequences=num_return_sequences,
        )

    return [tokenizer.decode(ids) for ids in out]
39
+
40
# Streamlit app
def main():
    """Render the Mayakovsky-style GPT text-generation page."""
    st.title("Генерация текста GPT-моделью в стиле В.В. Маяковского")

    # Collect the prompt and sampling settings from the user.
    prompt = st.text_area("Введите начало текста")
    temperature = st.slider("Temperature", min_value=0.2, max_value=2.5, value=1.8, step=0.1)
    top_p = st.slider("Top-p", min_value=0.1, max_value=1.0, value=0.9, step=0.1)
    max_length = st.slider("Max Length", min_value=10, max_value=300, value=100, step=10)
    top_k = st.slider("Top-k", min_value=1, max_value=500, value=500, step=10)
    num_return_sequences = st.slider("Number of Sequences", min_value=1, max_value=5, value=1, step=1)

    if st.button("Generate Text"):
        st.subheader("Generated Text:")
        # One generate_text call per requested sequence, shown in order.
        for seq_idx in range(num_return_sequences):
            sample = generate_text(prompt, temperature, top_p, max_length, top_k)
            st.write(f"Generated Text {seq_idx + 1}:")
            st.write(textwrap.fill(sample[0], width=80))
            st.write("------------------")

    st.sidebar.image('images/mayakovsky.jpeg', use_column_width=True)

if __name__ == "__main__":
    main()
pages/polyclinics.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import numpy as np
import time
import pickle
import torch
import pandas as pd
from gensim.models import KeyedVectors
from transformers import BertTokenizer, BertModel
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from function.lstm_preprocessing import (
    clean,
    tokin,
    predict_ml_class,
    predict_sentence,
    predict_single_string,
    LSTMClassifier
)

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

stemmer = SnowballStemmer('russian')
sw = stopwords.words('russian')

# Hyperparameters of the pretrained LSTM classifier — must match the values
# used at training time. (EMBEDDING_DIM was previously defined twice with
# the same value; a single definition is kept.)
EMBEDDING_DIM = 32
HIDDEN_DIM = 32
SEQ_LEN = 200
VOCAB_SIZE = 196906

# Word2Vec vectors used to initialise the embedding layer.
wv = KeyedVectors.load("file/wv.wordvectors", mmap='r')

with open('file/vocab_to_int.txt', 'rb') as f:
    vocab_to_int = pickle.load(f)

# Build the embedding matrix; words missing from the Word2Vec vocabulary
# keep a zero vector.
embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
for word, i in vocab_to_int.items():
    try:
        embedding_matrix[i] = wv[word]
    except KeyError:
        pass

embedding_layer = torch.nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix))

model = LSTMClassifier(embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_DIM, embedding=embedding_layer).to(DEVICE)
model.load_state_dict(torch.load('models/LTSM_model_epoch_7.pt', map_location='cpu'))

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model_BERT = BertModel.from_pretrained("bert-base-multilingual-cased")

# NOTE(review): pickle.load assumes these are trusted local artifacts —
# never load pickles from untrusted sources. Files are now closed promptly
# via context managers instead of being left open.
with open('models/LogReg.pickle', "rb") as f:
    loaded_model = pickle.load(f)

with open('models/trained_model.pkl', "rb") as f:
    loaded_classifier = pickle.load(f)

with open('models/vectorizer.pkl', "rb") as f:
    loaded_vectorizer = pickle.load(f)
56
+
57
def main():
    """Draw the page header and return the review text entered by the user."""
    st.title("Классификация отзыва на поликлиники")
    return st.text_area("Введите ваш отзыв:", "")

user_input = main()
63
+
64
def _timed(predict_fn):
    """Run *predict_fn* with no arguments and return a tuple of
    (prediction, elapsed wall-clock seconds rounded to 4 decimals).

    Factors out the start/stop timing boilerplate that was duplicated in
    all three predict_* wrappers below.
    """
    start_time = time.time()
    prediction = predict_fn()
    return prediction, round(time.time() - start_time, 4)

def predict_lstm(user_input):
    """Classify *user_input* with the LSTM model; returns (class, seconds)."""
    return _timed(lambda: predict_sentence(user_input, model, SEQ_LEN, vocab_to_int))

def predict_bert(user_input):
    """Classify *user_input* with BERT embeddings + LogReg; returns (class, seconds)."""
    return _timed(lambda: predict_single_string(user_input, model_BERT, loaded_model))

def predict_ML(user_input):
    """Classify *user_input* with Bag-of-Words + LogReg; returns (class, seconds)."""
    return _timed(lambda: predict_ml_class(user_input, loaded_vectorizer, loaded_classifier))
81
+
82
if user_input:
    # Run every classifier on the review and show its predicted class and
    # latency. The loop replaces three copy-pasted display blocks that all
    # reused the same (misleadingly named) variable; output order and all
    # user-facing strings are unchanged.
    for header, predict_fn in (
        ("### Bag-of-Words + LogReg", predict_ML),
        ("### LSTM модель", predict_lstm),
        ("### BERT модель + LogReg", predict_bert),
    ):
        prediction, time_taken = predict_fn(user_input)
        st.write(header)
        st.write("Предсказанный класс:", prediction)
        st.write("Время предсказания:", time_taken, "сек.")


st.sidebar.image('images/polyclinic.jpeg', use_column_width=True)

# f1-macro scores measured offline for each model (train / validation).
f1_score_classic_ml = 0.87
f1_score_rnn = 0.88
f1_score_bert = 0.83
f1_score_classic_ml_valid = 0.89
f1_score_rnn_valid = 0.92
f1_score_bert_valid = 0.82

# Comparison table of the three models, shown in the sidebar.
st.sidebar.write("### Сравнительная таблица по метрике f1-macro")
results = {
    "Модель": ["Классический ML", "LSTM", "BERT-based"],
    "train": [f1_score_classic_ml, f1_score_rnn, f1_score_bert],
    "valid": [f1_score_classic_ml_valid, f1_score_rnn_valid, f1_score_bert_valid]
}
results_df = pd.DataFrame(results)
st.sidebar.dataframe(results_df)
pages/pushkin.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import textwrap
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# All inference on this page runs on CPU; DEVICE is also used as the
# map_location so that checkpoint loading is safe on CPU-only hosts.
DEVICE = torch.device("cpu")

# Load the Russian GPT-2 tokenizer and base model, then overwrite the
# base weights with the Pushkin-finetuned checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('sberbank-ai/rugpt3small_based_on_gpt2')
model_finetuned = GPT2LMHeadModel.from_pretrained(
    'sberbank-ai/rugpt3small_based_on_gpt2',
    output_attentions=False,
    output_hidden_states=False,
)
# map_location works on both CUDA and CPU-only machines, so no branching
# on torch.cuda.is_available() is needed: the model's parameters live on
# CPU (DEVICE) in either case, and load_state_dict copies into them.
model_finetuned.load_state_dict(torch.load('models/model_pushkin.pt', map_location=DEVICE))
model_finetuned.eval()
19
+
20
# Function to generate text
def generate_text(prompt, temperature, top_p, max_length, top_k, num_return_sequences=1):
    """Generate continuations of *prompt* with the finetuned GPT-2 model.

    Args:
        prompt: Seed text to continue.
        temperature: Sampling temperature (higher = more random).
        top_p: Nucleus-sampling probability mass.
        max_length: Maximum length of the generated sequence in tokens.
        top_k: Top-k sampling cutoff.
        num_return_sequences: Number of independent samples to generate
            (default 1, matching the original behaviour).

    Returns:
        A list of decoded strings, one per generated sequence.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Inference only — disable autograd bookkeeping.
    with torch.no_grad():
        out = model_finetuned.generate(
            input_ids,
            do_sample=True,
            num_beams=5,
            temperature=temperature,
            top_p=top_p,
            max_length=max_length,
            top_k=top_k,
            no_repeat_ngram_size=3,
            num_return_sequences=num_return_sequences,
        )

    return [tokenizer.decode(ids) for ids in out]
39
+
40
# Streamlit app
def main():
    """Render the Pushkin-style GPT text-generation page."""
    st.title("Генерация текста GPT-моделью в стиле А.С. Пушкина")

    # Collect the prompt and sampling settings from the user.
    prompt = st.text_area("Введите начало текста")
    temperature = st.slider("Temperature", min_value=0.2, max_value=2.5, value=1.8, step=0.1)
    top_p = st.slider("Top-p", min_value=0.1, max_value=1.0, value=0.9, step=0.1)
    max_length = st.slider("Max Length", min_value=10, max_value=300, value=100, step=10)
    top_k = st.slider("Top-k", min_value=1, max_value=500, value=500, step=10)
    num_return_sequences = st.slider("Number of Sequences", min_value=1, max_value=5, value=1, step=1)

    if st.button("Generate Text"):
        st.subheader("Generated Text:")
        # One generate_text call per requested sequence, shown in order.
        for seq_idx in range(num_return_sequences):
            sample = generate_text(prompt, temperature, top_p, max_length, top_k)
            st.write(f"Generated Text {seq_idx + 1}:")
            st.write(textwrap.fill(sample[0], width=80))
            st.write("------------------")

    st.sidebar.image('images/pushkin.jpeg', use_column_width=True)

if __name__ == "__main__":
    main()
pages/toxic.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
+
5
def main():
    """Streamlit page that scores the toxicity of a Russian-language message."""
    st.title("Оценка токсичности сообщений")

    # Load the classifier. NOTE(review): this runs on every Streamlit
    # rerun; caching would speed it up but also change behaviour, so the
    # original load-per-run is kept.
    model_checkpoint = 'cointegrated/rubert-tiny-toxicity'
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
    if torch.cuda.is_available():
        model.cuda()

    def text2toxicity(text, aggregate=True):
        """Return an overall toxicity score (aggregate=True) or the raw
        vector of per-aspect probabilities (aggregate=False)."""
        with torch.no_grad():
            encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True).to(model.device)
            probs = torch.sigmoid(model(**encoded).logits).cpu().numpy()
        # A single string yields a batch of one — unwrap it.
        if isinstance(text, str):
            probs = probs[0]
        if not aggregate:
            return probs
        return 1 - probs.T[0] * (1 - probs.T[-1])

    message = st.text_area("Введите сообщение для оценки:")
    # The button is always rendered; scoring happens only for a non-empty message.
    if st.button("Оценить") and message:
        toxicity_score = text2toxicity(message)
        st.write(f"Степень токсичности: {toxicity_score:.4f}")

    st.sidebar.image('images/toxic.jpeg', use_column_width=True)


if __name__ == "__main__":
    main()