import logging
import re
import string

import arabicstopwords.arabicstopwords as ar_stp
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from snowballstemmer import stemmer


# Stemmers are created once at module load and reused by the helpers below.
ar_stemmer = stemmer("arabic")
porter = PorterStemmer()


def read_file(input_file, sep="\t", names=None):
    """Read a dataset into a DataFrame from an .xlsx file or a delimited text file."""
    if input_file.endswith(".xlsx"):
        df = pd.read_excel(input_file)
    elif names:
        df = pd.read_csv(input_file, sep=sep, names=names, encoding="utf-8")
    else:
        df = pd.read_csv(input_file, sep=sep, encoding="utf-8")
    return df
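
# Usage sketch (illustrative only; the file name and column names are
# hypothetical): df = read_file("queries.tsv", names=["qid", "query"])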


def remove_punctuation(text):
    """Remove every character that is neither a word character nor whitespace."""
    return re.sub(r"[^\w\s]", "", text)


def normalize_arabic(text):
    """Normalize common Arabic orthographic variants to a single canonical form."""
    text = re.sub("[إأٱآا]", "ا", text)  # unify alef variants
    text = re.sub("ى", "ي", text)  # alef maqsura -> ya
    text = re.sub("ؤ", "ء", text)  # hamza on waw -> bare hamza
    text = re.sub("ئ", "ء", text)  # hamza on ya -> bare hamza
    text = re.sub("ة", "ه", text)  # ta marbuta -> ha
    return text


def remove_punctuations_tashkeel(text):
    """Remove punctuation marks and Arabic diacritics (tashkeel) from an Arabic string."""
    punctuations = """`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ""" + string.punctuation

    arabic_diacritics = re.compile(
        """
        ّ | # Shadda
        َ | # Fatha
        ً | # Tanwin Fath
        ُ | # Damma
        ٌ | # Tanwin Damm
        ِ | # Kasra
        ٍ | # Tanwin Kasr
        ْ | # Sukun
        ـ  # Tatwil/Kashida
        """,
        re.VERBOSE,
    )

    text = text.translate(str.maketrans("", "", punctuations))
    text = arabic_diacritics.sub("", text)
    return text
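
# Illustration: remove_punctuations_tashkeel("مَرْحَبًا!") returns "مرحبا",
# with both the diacritics and the exclamation mark stripped.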


def remove_longation(text):
    """Normalize Arabic letter variants (a near-duplicate of normalize_arabic)
    and additionally map the gaf (گ) to kaf (ك)."""
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    return text


def remove_harakaat(text):
    """Strip harakaat (diacritics) and tatweel, then keep only tokens made of
    Arabic letters and digits."""
    accents = re.compile(r"[\u064b-\u0652\u0640]")  # tashkeel range plus tatweel
    arabic_or_digit = re.compile(r"[\u0621-\u063A\u0641-\u064A\d]+")
    text = " ".join(arabic_or_digit.findall(accents.sub("", text)))
    return text.strip()


def ar_remove_stop_words(sentence):
    """Remove Arabic stopwords using the arabicstopwords package's list."""
    stop_words = set(ar_stp.stopwords_list())
    return " ".join(term for term in sentence.split() if term not in stop_words)


def ar_stem(sentence):
    """Stem each whitespace-separated token with the Snowball Arabic stemmer."""
    return " ".join(ar_stemmer.stemWord(word) for word in sentence.split())


def en_remove_stop_words(sentence):
    """Remove English stopwords using NLTK's stopword list."""
    stop_words = set(stopwords.words("english"))
    return " ".join(term for term in sentence.split() if term not in stop_words)


def en_stem(sentence):
    """Stem each token with the Porter stemmer."""
    return " ".join(porter.stem(word) for word in word_tokenize(sentence))


def clean(text):
    """Clean input text of URLs, retweet markers, @handles, stray punctuation,
    tabs, line jumps, extra whitespace, and emojis/smileys."""
    text = re.sub(r"http\S+", " ", text)      # URLs
    text = re.sub(r"\bRT\b", " ", text)       # retweet marker
    text = re.sub(r"@\w*", " ", text)         # @handles
    text = re.sub(r"[.,#_|:?/=]", " ", text)  # assorted punctuation
    text = re.sub(r"[\t\n]", " ", text)       # tabs and line jumps
    text = re.sub(r"\s+", " ", text)          # collapse whitespace
    text = text.strip()
    return remove_emoji_smileys(text)


def remove_emoji_smileys(text):
    """Remove common emoji ranges and ASCII smileys from text."""
    try:
        # Wide-build pattern: match emoji code points directly.
        emojis_pattern = re.compile(
            "([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])"
        )
    except re.error:
        # Fallback for narrow (UCS-2) builds, where the code points above are
        # only representable as surrogate pairs.
        emojis_pattern = re.compile(
            "([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])"
        )

    smileys_pattern = re.compile(
        r"(\s?:X|:|;|=)(?:-)?(?:\)+|\(|O|D|P|S|\\|\/\s){1,}", re.IGNORECASE
    )

    text = smileys_pattern.sub("", text)
    text = emojis_pattern.sub("", text)
    return text


def preprocess_english(sentence):
    """Lowercase, remove English stopwords, then Porter-stem the sentence."""
    sentence = sentence.lower()
    sentence = en_remove_stop_words(sentence)
    return en_stem(sentence)


def preprocess_arabic(sentence):
    """Normalize Arabic orthography, remove stopwords, then stem the sentence."""
    sentence = normalize_arabic(sentence)
    sentence = ar_remove_stop_words(sentence)
    return ar_stem(sentence)


def preprocess(query, lang):
    """Clean and preprocess a query; any lang other than "en" is treated as Arabic."""
    query = clean(query)
    query = remove_punctuation(query)
    if lang == "en":
        return preprocess_english(query)
    return preprocess_arabic(query)


def initailize_logger(logger, log_file, level):
    """Attach file and stream handlers to the logger (only once) and set its level."""
    if not logger.handlers:
        formatter = logging.Formatter(
            "%(asctime)s %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
        )
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        logger.setLevel(level)
        logger.addHandler(file_handler)
        logger.addHandler(stream_handler)
    return logger
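

# Minimal usage sketch. It assumes the NLTK "stopwords" and "punkt" resources
# have already been downloaded (e.g. nltk.download("stopwords"),
# nltk.download("punkt")); the sample strings are illustrative only.
if __name__ == "__main__":
    print(preprocess("RT @user Check out https://example.com today! :)", "en"))
    print(preprocess("الكتاب على الطاولة", "ar"))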