# -*- coding: utf-8 -*-
import re
from snowballstemmer import stemmer
import arabicstopwords.arabicstopwords as stp
from tqdm import tqdm
import pandas as pd
import arabicstopwords.arabicstopwords as ar_stp
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from snowballstemmer import stemmer
from nltk.stem import PorterStemmer
import string
import logging
import global_variables as gb
# Shared Snowball stemmer configured for Arabic; ar_stem() calls its stemWord().
ar_stemmer = stemmer("arabic")
# Shared NLTK Porter stemmer instance; en_stem() calls its stem().
porter= PorterStemmer()
# read file based on its extension (tsv or xlsx)
def read_file(input_file, sep="\t", names = ""):
    """Load a tabular file into a pandas DataFrame.

    Files whose name ends in ``.xlsx`` are read with ``pandas.read_excel``;
    everything else is read as UTF-8 delimited text with ``pandas.read_csv``.

    Args:
        input_file: Path of the file to read.
        sep: Field delimiter for delimited-text files (tab by default).
            Not used for ``.xlsx`` files.
        names: Column names to assign when reading delimited text. The
            default ``""`` means no names are passed, so pandas infers the
            header from the file.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    # Each branch returns immediately so that a later read can never
    # overwrite the result of an earlier one.
    if str(input_file).lower().endswith(".xlsx"):
        return pd.read_excel(input_file)
    if names != "":
        return pd.read_csv(input_file, sep=sep, names=names, encoding="utf-8")
    return pd.read_csv(input_file, sep=sep, encoding="utf-8")
def remove_punctuation(text):
    """Return ``text`` with every non-word, non-whitespace character removed.

    Word characters (letters, digits and the underscore) and whitespace are
    kept unchanged.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)
#a function to normalize the tweets
def normalize_arabic(text):
    """Normalize common Arabic letter variants to a single canonical form.

    The following substitutions are applied, in order:
      * alef variants (إ أ ٱ آ) -> bare alef (ا)
      * alef maqsura (ى) -> yeh (ي)
      * hamza on waw (ؤ) and hamza on yeh (ئ) -> standalone hamza (ء)
      * teh marbuta (ة) -> heh (ه)

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    # Without this return the function yields None and every caller that
    # expects a string (e.g. preprocess_arabic) fails.
    return text
def remove_punctuations_tashkeel(text):
    """Remove punctuation and Arabic diacritics (tashkeel) from ``text``.

    The input should be an Arabic string.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without the Arabic/Latin punctuation characters listed
        below (plus everything in ``string.punctuation``) and without
        tashkeel marks or the tatweel (kashida) character.
    """
    punctuations = """`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ""" + string.punctuation

    # Written with escapes so the combining marks stay readable; re.VERBOSE
    # lets each alternative carry its own comment.
    arabic_diacritics = re.compile(
        r"""
        \u0651 |  # Shadda
        \u064E |  # Fatha
        \u064B |  # Tanwin Fath
        \u064F |  # Damma
        \u064C |  # Tanwin Damm
        \u0650 |  # Kasra
        \u064D |  # Tanwin Kasr
        \u0652 |  # Sukun
        \u0640    # Tatwil/Kashida
        """,
        re.VERBOSE,
    )

    # remove punctuation
    translator = str.maketrans("", "", punctuations)
    text = text.translate(translator)

    # remove tashkeel
    text = re.sub(arabic_diacritics, "", text)
    return text
def remove_longation(text):
    """Collapse Arabic letter variants onto a canonical letter.

    Alef variants (إ أ آ) become ا, ى becomes ي, ؤ and ئ become ء,
    ة becomes ه and گ becomes ك. All other characters are left unchanged.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
def remove_harakaat(text):
    """Drop harakaat/tatweel and keep only Arabic-letter and digit runs.

    Characters in U+064B-U+0652 and the tatweel U+0640 are deleted first.
    Then every maximal run of Arabic letters (U+0621-U+063A, U+0641-U+064A),
    digits, or the literal ``+`` (it is part of the character class) is
    extracted. The runs are joined with single spaces; anything else in the
    input only acts as a separator.
    """
    diacritics = re.compile(r"[\u064b-\u0652\u0640]")
    kept_runs = re.compile(r"[\u0621-\u063A\u0641-\u064A\d+]+")
    without_marks = diacritics.sub("", text)
    pieces = kept_runs.findall(without_marks)
    return " ".join(pieces).strip()
# Remove Arabic stop words from a sentence
def ar_remove_stop_words(sentence):
    """Remove Arabic stop words from ``sentence``.

    The sentence is split on whitespace and every token found in the list
    returned by ``ar_stp.stopwords_list()`` is dropped.

    Args:
        sentence: Whitespace-separated text.

    Returns:
        The remaining tokens joined with single spaces (``""`` when every
        token is a stop word or the input is empty).
    """
    # A set gives O(1) membership tests for each token.
    stop_words = set(ar_stp.stopwords_list())
    return " ".join(term for term in sentence.split() if term not in stop_words)
def ar_stem(sentence):
    """Stem each whitespace-separated token with the shared Arabic stemmer.

    Returns the stemmed tokens joined with single spaces.
    """
    stems = map(ar_stemmer.stemWord, sentence.split())
    return " ".join(stems)
# Remove English stop words from a sentence
def en_remove_stop_words(sentence):
    """Remove English stop words from ``sentence``.

    The sentence is split on whitespace and every token present in NLTK's
    English stop-word list is dropped. The comparison is exact, so the input
    is expected to be lower-cased already (``preprocess_english`` does this
    before calling).

    Args:
        sentence: Whitespace-separated text.

    Returns:
        The remaining tokens joined with single spaces (``""`` when every
        token is a stop word or the input is empty).
    """
    # A set gives O(1) membership tests for each token.
    stop_words = set(stopwords.words('english'))
    return " ".join(term for term in sentence.split() if term not in stop_words)
def en_stem(sentence):
    """Stem every token of an English sentence with the Porter stemmer.

    Args:
        sentence: The text to stem.

    Returns:
        The stemmed tokens joined with single spaces.
    """
    # NOTE(review): the original body read an undefined `token_words`.
    # NLTK's word_tokenize (already imported by this file) is assumed to be
    # the intended tokenizer; confirm, and make sure its tokenizer data is
    # installed where this runs.
    token_words = word_tokenize(sentence)
    return " ".join(porter.stem(word) for word in token_words)
def clean(text):
    """Clean input text from URLs, handles, tabs, line breaks and extra whitespace.

    Steps, in order: remove URLs, the retweet marker ``RT`` and ``@handles``;
    replace a fixed set of special characters with spaces; collapse all
    whitespace runs into a single space and trim the ends; finally strip
    smileys and emojis via ``remove_emoji_smileys``.

    Args:
        text: Raw text (e.g. a tweet).

    Returns:
        The cleaned text.
    """
    text = re.sub(r"http\S+", " ", text)  # remove urls
    # \b keeps the match on the standalone retweet marker, so words that
    # merely end in "RT" (e.g. "SMART ") are left intact.
    text = re.sub(r"\bRT ", " ", text)  # remove rt
    text = re.sub(r"@[\w]*", " ", text)  # remove handles
    text = re.sub(r"[\.\,\#_\|\:\?\?\/\=]", " ", text)  # remove special characters
    # \s+ also covers tabs and line breaks, so no separate passes are needed.
    text = re.sub(r"\s+", " ", text)  # remove tabs, line breaks, extra white space
    text = text.strip()
    text = remove_emoji_smileys(text)
    return text
# NOTE(review): the emoji pattern literals were missing from the original
# source (both `re.compile(` calls had no arguments and the `try:` was gone).
# The ranges below are a reconstruction covering Miscellaneous Symbols and
# Dingbats (U+2600-U+27BF), pictographs and emoticons (U+1F300-U+1F64F) and
# transport/map symbols (U+1F680-U+1F6FF); confirm against the intended
# pattern. Python 3 strings hold full code points, so the UCS-2 surrogate-pair
# fallback is no longer needed.
_EMOJIS_PATTERN = re.compile(
    r"[\u2600-\u27BF\U0001F300-\U0001F64F\U0001F680-\U0001F6FF]"
)
_SMILEYS_PATTERN = re.compile(r"(\s?:X|:|;|=)(?:-)?(?:\)+|\(|O|D|P|S|\\|\/\s){1,}", re.IGNORECASE)


def remove_emoji_smileys(text):
    """Remove ASCII smileys and emoji characters from ``text``.

    Smileys are removed first (case-insensitively), then emoji code points.
    Surrounding whitespace is not re-collapsed, so removing a token may
    leave adjacent spaces behind.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without the matched smileys and emojis.
    """
    text = _SMILEYS_PATTERN.sub(r"", text)
    text = _EMOJIS_PATTERN.sub(r"", text)
    return text
def preprocess_english(sentence):
    """Lower-case an English sentence, drop its stop words, then stem it."""
    lowered = sentence.lower()
    return en_stem(en_remove_stop_words(lowered))
def preprocess_arabic(sentence): # for Arabic
    """Normalize an Arabic sentence, drop its stop words, then stem it."""
    normalized = normalize_arabic(sentence)
    return ar_stem(ar_remove_stop_words(normalized))
def preprocess(query, lang):
    """Clean ``query`` and run the language-specific preprocessing pipeline.

    The query is cleaned and stripped of punctuation, then passed to
    ``preprocess_english`` when ``lang`` is ``"en"`` and to
    ``preprocess_arabic`` for any other value.
    """
    cleaned = remove_punctuation(clean(query))
    pipeline = preprocess_english if lang == "en" else preprocess_arabic
    return pipeline(cleaned)
def initailize_logger(logger, log_file, level):
    """Configure ``logger`` to write to both ``log_file`` and the console.

    Configuration happens only when the logger has no handlers yet, so
    calling this repeatedly for the same logger does not duplicate output.

    Args:
        logger: The ``logging.Logger`` to configure.
        log_file: Path of the file that receives the log records.
        level: Logging level to set on the logger (e.g. ``logging.INFO``).

    Returns:
        The same ``logger`` object, for convenience.
    """
    if not logger.handlers:  # avoid creating more than one handler
        logger.setLevel(level)
        formatter = logging.Formatter('%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
        fileHandler = logging.FileHandler(log_file)
        streamHandler = logging.StreamHandler()
        # The handlers must be given the formatter and attached to the
        # logger; creating them alone has no effect on its output.
        for handler in (fileHandler, streamHandler):
            handler.setFormatter(formatter)
            logger.addHandler(handler)
    return logger