sentiment-analysis / utils /preprocess.py
geetu040's picture
Initial Upload
d08668b
raw
history blame
489 Bytes
import torch
import torchtext
import re
def clean_text(text: str) -> str:
    """Normalize raw text before tokenization.

    The text is lowercased, every character that is neither a word
    character (letter, digit, underscore) nor whitespace is removed, runs
    of whitespace are collapsed into a single space, and leading/trailing
    whitespace is stripped.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string. It is empty when the input contains nothing
        but punctuation and/or whitespace.
    """
    # Lowercase the text
    text = text.lower()
    # Remove punctuation marks (raw string: no invalid-escape warnings)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace only AFTER punctuation removal: deleting a
    # free-standing mark such as the dash in "a - b" would otherwise
    # leave a double space behind.
    text = re.sub(r'\s+', ' ', text)
    # Strip last, so a space exposed by removing leading/trailing
    # punctuation (e.g. "wow !") does not survive at the edges.
    return text.strip()
def get_preprocess(vocab_path):
    """Build a callable that turns raw text into a vocabulary lookup result.

    Args:
        vocab_path: Location of the serialized vocabulary object, passed
            straight to ``torch.load``.

    Returns:
        A one-argument function. Given a string, it cleans it with
        ``clean_text``, tokenizes it with torchtext's ``'basic_english'``
        tokenizer and returns whatever calling the loaded vocabulary on
        the token list yields (presumably a list of token indices --
        confirm against the object stored at ``vocab_path``).
    """
    tokenize = torchtext.data.utils.get_tokenizer('basic_english')
    # NOTE(review): torch.load unpickles the file, so vocab_path should only
    # ever point at a trusted file.
    vocabulary = torch.load(vocab_path)

    def preprocess(text):
        # Same pipeline on every call: clean -> tokenize -> vocab lookup.
        cleaned = clean_text(text)
        tokens = tokenize(cleaned)
        return vocabulary(tokens)

    return preprocess