oranne55 committed on
Commit
36611fd
1 Parent(s): c1a0c2c

Create preprocessing.py

Browse files
Files changed (1) hide show
  1. preprocessing.py +17 -0
preprocessing.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from nltk.corpus import stopwords
3
+ from nltk.stem import WordNetLemmatizer
4
+
5
+ stop_words = set(stopwords.words('english'))  # English stop-word list held as a set; preprocess_text tests each token for membership in it
6
+ lemmatizer = WordNetLemmatizer()  # one shared lemmatizer instance, created at import time and reused by preprocess_text
7
+
8
def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    The cleaning steps run in this order:

    1. Lowercase the input.
    2. Delete runs of non-whitespace that start with ``http`` or ``www``.
    3. Delete ``<...>`` spans (non-greedy, so each tag is removed separately).
    4. Delete every character that is not an ASCII letter or whitespace.
    5. Collapse whitespace runs to a single space and trim both ends.
    6. Drop tokens found in the module-level ``stop_words`` set.
    7. Lemmatize each remaining token with the module-level ``lemmatizer``.

    Args:
        text: The string to clean.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    # (pattern, replacement) pairs, applied strictly in this order: URLs and
    # tags must be removed before the non-letter filter strips the characters
    # that make them recognizable.
    substitutions = (
        (r'http\S+|www\S+|https\S+', ''),
        (r'<.*?>', ''),
        (r'[^a-zA-Z\s]', ''),
        (r'\s+', ' '),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept_tokens = [
        token for token in cleaned.strip().split() if token not in stop_words
    ]
    return ' '.join(map(lemmatizer.lemmatize, kept_tokens))