|
import re |
|
import string |
|
import numpy as np |
|
import torch |
|
import unicodedata |
|
import nltk |
|
|
|
|
|
# Fetch the stopword corpus at import time (no-op if already cached locally).
nltk.download('stopwords')

from nltk.corpus import stopwords

# BUG FIX: stopwords.words('russian', 'english') passed 'english' as the
# second positional parameter (ignore_lines_startswith), so English stopwords
# were never actually loaded. Passing a list of fileids loads both languages.
stop_words = set(stopwords.words(['russian', 'english']))
|
|
|
def data_preprocessing(text: str) -> str:
    """Clean raw text for tokenization.

    Steps: lowercase, map hyphens/newlines to spaces, strip HTML-like tags,
    keep only Unicode letters/digits/separators and apostrophes, then drop
    stopwords and purely numeric tokens.

    Args:
        text: raw input string.

    Returns:
        The cleaned, single-space-joined string.
    """
    text = text.lower()
    text = text.replace('-', ' ').replace('\n', ' ')
    # Remove HTML-like tags (non-greedy match between angle brackets).
    text = re.sub(r'<.*?>', '', text)
    # Keep letters (L*), numbers (N*), separators (Z*) and apostrophes.
    text = ''.join(
        c for c in text
        if unicodedata.category(c).startswith(('L', 'N', 'Z')) or c == "'"
    )
    # Single filtering pass: text is already lowercase, so the original
    # word.lower() call was redundant; stopword and digit filters combined.
    kept = [
        word for word in text.split()
        if word not in stop_words and not word.isdigit()
    ]
    return ' '.join(kept)
|
|
|
|
|
def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    """Return only the (word, count) pairs whose count is strictly above *n*.

    Args:
        sorted_words: list of (word, frequency) tuples.
        n: frequency threshold; pairs with frequency > n are kept.

    Returns:
        The filtered list, original order preserved.
    """
    return [pair for pair in sorted_words if pair[1] > n]
|
|
|
def padding(review_int: list, seq_len: int) -> np.ndarray:
    """Left-pad (or truncate) each encoded review to exactly *seq_len*.

    Shorter reviews get leading zeros; longer ones keep only their first
    seq_len token ids. (Return annotation fixed: np.array is a factory
    function, np.ndarray is the actual type.)

    Args:
        review_int: list of lists of integer token ids.
        seq_len: target sequence length.

    Returns:
        int array of shape (len(review_int), seq_len).
    """
    features = np.zeros((len(review_int), seq_len), dtype=int)
    for i, review in enumerate(review_int):
        clipped = review[:seq_len]
        # Guard: an empty review must leave the all-zero row untouched
        # (features[i, -0:] would otherwise address the whole row).
        if clipped:
            features[i, seq_len - len(clipped):] = clipped
    return features
|
|
|
def preprocess_single_string(
    input_string: str,
    seq_len: int,
    vocab_to_int: dict,
    verbose: bool = False,
) -> torch.Tensor:
    """Encode one raw string as a fixed-length tensor of token ids.

    The string is cleaned with data_preprocessing(), each word is mapped
    through vocab_to_int (out-of-vocabulary words are skipped), and the id
    sequence is left-padded/truncated to seq_len via padding().
    (Return annotation fixed: torch.tensor is a factory function, the type
    is torch.Tensor.)

    Args:
        input_string: raw text to encode.
        seq_len: target sequence length.
        vocab_to_int: word -> integer id mapping.
        verbose: if True, print each out-of-vocabulary word.

    Returns:
        1-D torch.Tensor of length seq_len.
    """
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        # Membership test instead of try/except KeyError + dead `pass`;
        # the printed message matches the original (KeyError's str() quotes
        # the missing key, hence the explicit quotes here).
        if word in vocab_to_int:
            result_list.append(vocab_to_int[word])
        elif verbose:
            print(f"'{word}': not in dictionary!")
    result_padded = padding([result_list], seq_len)[0]
    return torch.tensor(result_padded)