File size: 1,014 Bytes
ef22d5e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK data packages used by the tokenizer, the stop-word list and
# the WordNet lemmatizer. Only ids that exist in the NLTK data index are
# listed. Newer NLTK releases load 'punkt_tab' for word_tokenize; 'punkt' is
# kept so that older releases keep working.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# English stop words, held in a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))
# Shared lemmatizer instance reused by every preprocess() call.
lemmatizer = WordNetLemmatizer()
def preprocess(text, target_language='en'):
    """Clean raw text and return its lemmatized, stop-word-free tokens.

    The text is lowercased, URLs are stripped, every character that is not
    an ASCII letter or whitespace is removed, and runs of three or more
    identical characters are collapsed to a single one. The result is
    tokenized, English stop words are dropped and the remaining tokens are
    lemmatized.

    Args:
        text: The text to clean. Non-string values are converted with
            ``str()``; ``None`` and NaN are treated as missing text.
        target_language: Currently unused; the stop-word list is always
            English. Kept so existing callers keep working.

    Returns:
        A list of token strings, empty when nothing survives the cleaning.

    Raises:
        TypeError: If ``text`` cannot be converted to a string.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan". NaN is the only float that is unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []

    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception as exc:
            raise TypeError('Input must be a string or a float') from exc

    # Convert to lowercase.
    text = text.lower()
    # Remove URLs.
    text = re.sub(r'http\S+', '', text)
    # Remove digits, punctuation and any other non-letter characters.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse runs of 3+ identical characters ("soooo" -> "so").
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    # Nothing left to tokenize.
    if not text.strip():
        return []

    # Drop stop words before lemmatizing: the lemmatizer can alter a stop
    # word (e.g. "was" -> "wa") so that it no longer matches the list.
    words = [w for w in word_tokenize(text) if w not in stop_words]
    words = [lemmatizer.lemmatize(w) for w in words]
    # Filter once more in case a lemma is itself a stop word.
    return [w for w in words if w not in stop_words]