oranne55
/

qualifier-model4-finetune-pretrained-transformer-for-long-inputs

Text Classification

Inference Endpoints

Model card Files Files and versions Community

oranne55 committed on Nov 10, 2024

Commit

36611fd

•

1 Parent(s): c1a0c2c

Create preprocessing.py

Files changed (1) hide show

preprocessing.py +17 -0

preprocessing.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import re
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+# Built once at import time so preprocess_text() does not rebuild them per call.
+# A set gives constant-time membership tests when filtering words.
+# NOTE(review): presumably requires the NLTK 'stopwords' corpus (and 'wordnet'
+# for the lemmatizer) to be downloaded beforehand - confirm in the deployment setup.
+stop_words = set(stopwords.words('english'))
+# Shared lemmatizer instance used by preprocess_text().
+lemmatizer = WordNetLemmatizer()
+def preprocess_text(text):
+    # Text processing steps
+    text = text.lower()
+    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
+    text = re.sub(r'<.*?>', '', text)
+    text = re.sub(r'[^a-zA-Z\s]', '', text)
+    text = re.sub(r'\s+', ' ', text).strip()
+    text = ' '.join(word for word in text.split() if word not in stop_words)
+    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())
+    return text