youtube-trend-prediction / preprocessText.py
xinah3131's picture
Upload 3 files
ef22d5e
raw
history blame
1.01 kB
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK data packages this module relies on at import time.
# The former nltk.download('corpus') call was dropped: 'corpus' is not a
# package identifier in the NLTK data index, so it only logged an error.
nltk.download('punkt')      # tokenizer models used by word_tokenize
# NOTE(review): newer NLTK releases load 'punkt_tab' instead of 'punkt' in
# word_tokenize; fetching both keeps the module working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')  # stop-word lists read just below
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer
nltk.download('omw-1.4')    # Open Multilingual Wordnet data used with wordnet

# Built once at import so preprocess() gets O(1) stop-word membership tests
# and reuses a single lemmatizer instance.
stop_words = set(stopwords.words('english'))  # set of English stop words
lemmatizer = WordNetLemmatizer()
def preprocess(text, target_language='en'):
    """Normalize raw text into a list of lemmatized tokens without stop words.

    The steps run in this order: lowercase, strip ``http...`` URLs, drop every
    character that is not an ASCII letter or whitespace, collapse any
    character (other than a newline) repeated three or more times in a row
    down to one occurrence, tokenize, lemmatize, and filter out English stop
    words.

    Args:
        text: The text to clean. A non-string value is converted with
            ``str()`` first.
        target_language: Currently unused by this function; kept so existing
            callers that pass it keep working.

    Returns:
        A list of token strings.

    Raises:
        TypeError: If ``text`` is not a string and ``str(text)`` fails.
    """
    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception as err:
            # Only real conversion failures become TypeError; a bare except
            # would also swallow KeyboardInterrupt and SystemExit.
            raise TypeError('Input must be a string or a float') from err

    # Convert to lowercase
    text = text.lower()
    # Remove URLs (anything starting with "http" up to the next whitespace)
    text = re.sub(r'http\S+', '', text)
    # Remove digits, punctuation and any other non-letter, non-space character
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse runs of 3+ identical characters to one (e.g. "sooo" -> "so")
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    words = word_tokenize(text)
    # NOTE(review): lemmatizing before the stop-word filter means a stop word
    # that the lemmatizer alters no longer matches the stop list. The order is
    # kept as-is so the output stays identical for existing consumers.
    words = [lemmatizer.lemmatize(w) for w in words]
    words = [w for w in words if w not in stop_words]
    return words