File size: 489 Bytes
d08668b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
import torch
import torchtext
import re

def clean_text(text):
	"""Normalize a string: lowercase it, drop punctuation, tidy whitespace.

	Punctuation is removed before whitespace is normalized, so deleting a
	standalone punctuation mark cannot leave a double space or a
	leading/trailing space in the result.

	Args:
		text: The string to normalize.

	Returns:
		The lowercased string with every character that is neither a word
		character nor whitespace removed, each whitespace run collapsed to
		a single space, and no leading or trailing whitespace.
	"""
	# Lowercase first, then filter, keeping the original lower -> filter order.
	text = text.lower()
	# Drop everything that is not a word character or whitespace. Note that
	# \w includes the underscore, so underscores are kept.
	text = re.sub(r'[^\w\s]', '', text)
	# Collapse whitespace only now: removing punctuation can create new
	# adjacent or trailing whitespace (e.g. "a , b" -> "a  b").
	text = re.sub(r'\s+', ' ', text)
	return text.strip()

def get_preprocess(vocab_path):
	"""Build a callable that turns raw text into vocabulary lookups.

	Args:
		vocab_path: Location of the saved vocabulary object, passed
			straight to ``torch.load``.

	Returns:
		A one-argument function that cleans the text with ``clean_text``,
		tokenizes it with torchtext's ``basic_english`` tokenizer and calls
		the loaded vocabulary object on the resulting tokens.
	"""
	tokenize = torchtext.data.utils.get_tokenizer('basic_english')
	# NOTE(review): torch.load unpickles the file, so vocab_path should only
	# ever point at trusted data. The loaded object is presumably a torchtext
	# Vocab (callable on a token list) -- confirm against how it was saved.
	vocabulary = torch.load(vocab_path)

	def preprocess(text):
		cleaned = clean_text(text)
		tokens = tokenize(cleaned)
		return vocabulary(tokens)

	return preprocess