import re

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# English stop words, held in a set so the per-token membership test in
# preprocess_text is O(1). stopwords.words() reads the NLTK "stopwords"
# corpus, so that corpus must be installed before this module is imported.
stop_words = set(stopwords.words('english'))

# Single lemmatizer instance, created once at import time and reused for
# every call instead of being rebuilt per document.
lemmatizer = WordNetLemmatizer()


# Cleanup passes applied, in this order, to the lowercased text:
# (compiled pattern, replacement). Order matters: URLs and tags must be
# removed while their punctuation is still present to delimit them.
_CLEANUP_STEPS = (
    # URLs and www-prefixed tokens. "http\S+" already covers "https...",
    # so no separate https alternative is needed.
    (re.compile(r'http\S+|www\S+'), ''),
    # HTML-like tags. "." does not match newlines here, so a tag that spans
    # several lines is not removed as a unit.
    (re.compile(r'<.*?>'), ''),
    # Everything that is neither an ASCII letter nor whitespace is deleted
    # (not replaced by a space), so "don't" becomes "dont".
    (re.compile(r'[^a-zA-Z\s]'), ''),
    # Collapse whitespace runs to a single space.
    (re.compile(r'\s+'), ' '),
)


def preprocess_text(text: str) -> str:
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase; strip URLs; strip HTML-like tags; drop every
    character that is not an ASCII letter or whitespace; collapse whitespace;
    remove tokens found in the module-level ``stop_words`` set; lemmatize the
    remaining tokens with the module-level ``lemmatizer`` (called without a
    part-of-speech argument, so the lemmatizer's default applies).

    Args:
        text: The raw input string.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.

    Raises:
        AttributeError: If ``text`` has no ``lower`` method (e.g. ``None``).
    """
    text = text.lower()
    for pattern, replacement in _CLEANUP_STEPS:
        text = pattern.sub(replacement, text)

    # split() with no argument already ignores leading/trailing whitespace,
    # and tokens contain no whitespace, so filtering and lemmatizing in one
    # pass yields the same result as two separate split/join rounds.
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )