import re
import string
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

# NLTK stopwords must be available locally (run nltk.download('stopwords') once)
stop_words = set(stopwords.words('russian'))
stemmer = SnowballStemmer('russian')
sw = stopwords.words('russian')  # same stop list, kept as a list for the stemming pipeline below
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

class LSTMClassifier(nn.Module):
    """Binary sentiment classifier: embedding -> single-layer LSTM -> linear head."""

    def __init__(self, embedding_dim: int, hidden_size: int, embedding: nn.Embedding) -> None:
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            batch_first=True
        )
        self.clf = nn.Linear(self.hidden_size, 1)

    def forward(self, x):
        embeddings = self.embedding(x)
        _, (h_n, _) = self.lstm(embeddings)  # last hidden state is used as the sequence representation
        out = self.clf(h_n.squeeze())        # raw logit(s); apply sigmoid outside the model
        return out
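
# Usage sketch (illustrative only -- the real vocabulary size, embedding dimension and
# hidden size come from the training setup and are not defined in this file):
#   emb = nn.Embedding(num_embeddings=20_000, embedding_dim=64)
#   lstm_clf = LSTMClassifier(embedding_dim=64, hidden_size=32, embedding=emb)
#   logits = lstm_clf(torch.randint(0, 20_000, (4, 128)))  # (batch, seq_len) -> (batch, 1) raw logits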

def data_preprocessing(text: str) -> str:
    """Preprocess a string: lowercase, remove HTML tags, punctuation,
    stopwords and digits.

    Args:
        text (str): input string for preprocessing

    Returns:
        str: preprocessed string
    """
    text = text.lower()
    text = re.sub('<.*?>', '', text)  # remove HTML tags
    text = ''.join([c for c in text if c not in string.punctuation])  # remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])
    text = ' '.join([word for word in text.split() if not word.isdigit()])
    return text
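
# Example (hypothetical review; the exact result depends on the NLTK Russian stop list):
#   data_preprocessing('Прекрасный <b>фильм</b>, 10 из 10!')
#   -> roughly 'прекрасный фильм' (tags, punctuation, digits and stopwords dropped)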

def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    """Keep only (word, count) pairs whose count is strictly greater than n."""
    return list(filter(lambda x: x[1] > n, sorted_words))
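
# Example (made-up counts): words seen no more than n times are dropped.
#   get_words_by_freq([('фильм', 120), ('сюжет', 15), ('скучно', 3)], n=10)
#   -> [('фильм', 120), ('сюжет', 15)]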

def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad (or trim) each sequence of token ids to a fixed length.

    Args:
        review_int (list): list of token-id sequences
        seq_len (int): max sequence length; if len(review_int[i]) > seq_len the sequence
            is trimmed, otherwise it is left-padded with zeros

    Returns:
        np.ndarray: padded sequences
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[:seq_len]
        features[i, :] = np.array(new)
    return features
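
# Example: shorter sequences are left-padded with zeros, longer ones are cut to seq_len tokens.
#   padding([[5, 7], [1, 2, 3, 4, 5, 6]], seq_len=4)
#   -> array([[0, 0, 5, 7],
#             [1, 2, 3, 4]])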

def preprocess_single_string(
        input_string: str,
        seq_len: int,
        vocab_to_int: dict,
) -> torch.Tensor:
    """Run all preprocessing steps on a single string.

    Args:
        input_string (str): input string for preprocessing
        seq_len (int): max sequence length; if the tokenized string is longer it is
            trimmed, otherwise it is left-padded with zeros
        vocab_to_int (dict): word-to-index mapping {'word': int index}

    Returns:
        torch.Tensor: padded sequence of token ids
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            print(f'{e}: not in dictionary!')  # out-of-vocabulary words are skipped
    result_padded = padding([result_list], seq_len)[0]
    return torch.tensor(result_padded)
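
# Example (sketch; vocab_to_int is built during training and loaded elsewhere,
# the toy mapping below is purely illustrative):
#   toy_vocab = {'фильм': 1, 'отличный': 2}
#   preprocess_single_string('Отличный фильм!', seq_len=5, vocab_to_int=toy_vocab)
#   -> tensor([0, 0, 0, 2, 1])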

def predict_sentence(text: str, model: nn.Module, seq_len: int, vocab_to_int: dict) -> str:
    """Classify a single review with the LSTM model."""
    p_str = preprocess_single_string(text, seq_len, vocab_to_int).unsqueeze(0)
    model.eval()
    with torch.no_grad():
        pred = model(p_str)
    output = pred.sigmoid().round().item()
    if output == 0:
        return 'Негативный отзыв'   # 'Negative review'
    else:
        return 'Позитивный отзыв'   # 'Positive review'
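
# Usage sketch (assumes the trained LSTM weights, seq_len and vocab_to_int are loaded
# elsewhere in the app; the file name and seq_len below are hypothetical):
#   lstm_clf.load_state_dict(torch.load('lstm_weights.pt', map_location='cpu'))
#   predict_sentence('Отличный фильм!', lstm_clf, seq_len=128, vocab_to_int=vocab_to_int)
#   -> 'Позитивный отзыв' or 'Негативный отзыв'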

def predict_single_string(text: str,
                          model: BertModel,
                          loaded_model: LogisticRegression
                          ) -> str:
    """Classify a single review: BERT [CLS] embedding -> logistic regression."""
    with torch.no_grad():
        encoded_input = tokenizer(text, return_tensors='pt')
        output = model(**encoded_input)
        vector = output[0][:, 0, :]  # [CLS] token embedding of the last hidden layer
    proba = loaded_model.predict_proba(vector)[0]
    if proba[0] > proba[1]:
        return 'Негативный отзыв'   # 'Negative review'
    else:
        return 'Позитивный отзыв'   # 'Positive review'
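
# Usage sketch (hypothetical file name; the BERT encoder and the fitted LogisticRegression
# are expected to be loaded by the app, for example):
#   import pickle
#   bert = BertModel.from_pretrained('bert-base-multilingual-cased')
#   with open('logreg.pkl', 'rb') as f:
#       log_reg = pickle.load(f)
#   predict_single_string('Отличный фильм!', bert, log_reg)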

def clean(text):
    """Basic normalisation for the classic ML pipeline."""
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)  # collapse two or more whitespace characters into one space
    text = re.sub(r'\d+', ' ', text)  # remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = re.sub(r'\n+', ' ', text)  # remove newline characters
    return text

def tokin(text):
    """Clean the text, stem every word and drop Russian stopwords."""
    text = clean(text)
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    text = ' '.join([word for word in text.split() if word not in sw])
    return text
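
# Example: clean() normalises the string, then each word is stemmed with SnowballStemmer
# and stopwords are removed.
#   tokin('Очень скучный фильм!!!')  # -> a stemmed, stopword-free string such as 'очен скучн фильм'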

def predict_ml_class(text, loaded_vectorizer, loaded_classifier):
    """Classify a single review with the bag-of-words vectorizer + classic ML classifier."""
    new_text_bow = loaded_vectorizer.transform([tokin(text)])  # vectorize the whole review as one document
    predicted_label = loaded_classifier.predict(new_text_bow)[0]
    if predicted_label == 0:
        return 'Негативный отзыв'   # 'Negative review'
    else:
        return 'Позитивный отзыв'   # 'Positive review'
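
# Usage sketch (hypothetical artefact names; the fitted vectorizer and classifier are
# expected to be loaded from disk by the app, e.g. with joblib):
#   import joblib
#   vectorizer = joblib.load('vectorizer.joblib')
#   clf = joblib.load('classifier.joblib')
#   predict_ml_class('Очень скучный фильм', vectorizer, clf)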