"""Text normalisation helpers that feed a tokenizer + vocabulary pipeline."""

import re

import torch
import torchtext

# Compiled once at import time; raw strings keep ``\s`` / ``\w`` from being
# parsed as (invalid) string escape sequences.
_PUNCTUATION_RE = re.compile(r"[^\w\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Return *text* lowercased, without punctuation, with tidy whitespace.

    Steps, in order:

    1. Lowercase the text.
    2. Delete every character that is neither a word character (``\\w``,
       which includes digits and the underscore) nor whitespace.
    3. Collapse each run of whitespace into a single space.
    4. Strip leading and trailing whitespace.

    Whitespace is normalised *after* punctuation is removed. Doing it the
    other way round lets punctuation removal reintroduce doubled or
    leading/trailing spaces (e.g. ``"a - b"`` would become ``"a  b"``).

    Args:
        text: The raw input string.

    Returns:
        The normalised string; empty if nothing but punctuation and
        whitespace was supplied.
    """
    lowered = text.lower()
    without_punctuation = _PUNCTUATION_RE.sub("", lowered)
    return _WHITESPACE_RE.sub(" ", without_punctuation).strip()


def get_preprocess(vocab_path):
    """Build a function that maps raw text to the vocabulary's output.

    Args:
        vocab_path: Location of the serialized vocabulary, passed straight
            to ``torch.load``.

    Returns:
        A one-argument callable. Given a string, it applies ``clean_text``,
        tokenizes the result with torchtext's ``basic_english`` tokenizer and
        returns whatever the loaded vocabulary object returns when called
        with the token list (presumably a list of token indices -- confirm
        against the object stored at ``vocab_path``).
    """
    tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
    # NOTE(review): torch.load deserializes with pickle, so only load
    # vocabulary files from a trusted source. Newer PyTorch releases may
    # default to ``weights_only=True`` and refuse arbitrary pickled objects;
    # confirm against the torch version this project pins.
    vocab = torch.load(vocab_path)

    def preprocess(text):
        """Clean, tokenize and look up *text* in the loaded vocabulary."""
        return vocab(tokenizer(clean_text(text)))

    return preprocess