# -*- coding: utf-8 -*-
import logging
import re
import string

import arabicstopwords.arabicstopwords as ar_stp
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from snowballstemmer import stemmer

ar_stemmer = stemmer("arabic")
porter = PorterStemmer()


# Read a file into a DataFrame based on its extension (tsv/csv or xlsx).
def read_file(input_file, sep="\t", names=None):
    if input_file.endswith(".xlsx"):
        df = pd.read_excel(input_file)
    elif names:
        df = pd.read_csv(input_file, sep=sep, names=names, encoding="utf-8")
    else:
        df = pd.read_csv(input_file, sep=sep, encoding="utf-8")
    return df


def remove_punctuation(text):
    # Remove punctuation using a regex (keep word characters and whitespace).
    return re.sub(r"[^\w\s]", "", text)


# A function to normalize the tweets.
def normalize_arabic(text):
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    return text


def remove_punctuations_tashkeel(text):
    """Remove punctuation and diacritics (tashkeel); the input should be an Arabic string."""
    punctuations = """`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ""" + string.punctuation
    arabic_diacritics = re.compile(
        """
        ّ    | # Shadda
        َ    | # Fatha
        ً    | # Tanwin Fath
        ُ    | # Damma
        ٌ    | # Tanwin Damm
        ِ    | # Kasra
        ٍ    | # Tanwin Kasr
        ْ    | # Sukun
        ـ     # Tatwil/Kashida
        """,
        re.VERBOSE,
    )
    # Remove punctuation.
    translator = str.maketrans("", "", punctuations)
    text = text.translate(translator)
    # Remove tashkeel.
    text = re.sub(arabic_diacritics, "", text)
    return text


def remove_longation(text):
    # Normalize elongated and variant letter forms.
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    return text


def remove_harakaat(text):
    # Harakaat (short-vowel diacritics) and tatweel (kashida) to remove.
    accents = re.compile(r"[\u064b-\u0652\u0640]")
    # Keep only Arabic letters and digits (do not remove numbers).
    arabic_chars = re.compile(r"[\u0621-\u063A\u0641-\u064A\d]+")
    text = " ".join(arabic_chars.findall(accents.sub("", text)))
    return text.strip()


# Remove Arabic stop words.
def ar_remove_stop_words(sentence):
    stop_words = set(ar_stp.stopwords_list())
    return " ".join(term for term in sentence.split() if term not in stop_words)


def ar_stem(sentence):
    # Stem each token with the Snowball Arabic stemmer.
    return " ".join(ar_stemmer.stemWord(word) for word in sentence.split())


# Remove English stop words.
def en_remove_stop_words(sentence):
    stop_words = set(stopwords.words("english"))
    return " ".join(term for term in sentence.split() if term not in stop_words)


def en_stem(sentence):
    # Stem each token with the Porter stemmer.
    token_words = word_tokenize(sentence)
    return " ".join(porter.stem(word) for word in token_words)


def clean(text):
    """Clean input text of URLs, handles, tabs, line breaks, and extra whitespace."""
    text = re.sub(r"http\S+", " ", text)      # remove URLs
    text = re.sub(r"RT ", " ", text)          # remove retweet marker
    text = re.sub(r"@[\w]*", " ", text)       # remove handles
    text = re.sub(r"[.,#_|:?/=]", " ", text)  # remove special characters
    text = re.sub(r"\t", " ", text)           # remove tabs
    text = re.sub(r"\n", " ", text)           # remove line breaks
    text = re.sub(r"\s+", " ", text)          # collapse extra whitespace
    text = text.strip()
    text = remove_emoji_smileys(text)
    return text


def remove_emoji_smileys(text):
    try:
        # UCS-4 build: match emoji code points directly.
        EMOJIS_PATTERN = re.compile(
            u"([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])"
        )
    except re.error:
        # UCS-2 build: match emoji as surrogate pairs.
        EMOJIS_PATTERN = re.compile(
            u"([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])"
        )
    SMILEYS_PATTERN = re.compile(
        r"(\s?:X|:|;|=)(?:-)?(?:\)+|\(|O|D|P|S|\\|\/\s){1,}", re.IGNORECASE
    )
    text = SMILEYS_PATTERN.sub(r"", text)
    text = EMOJIS_PATTERN.sub(r"", text)
    return text


def preprocess_english(sentence):
    # Apply the English preprocessing steps to the given sentence.
    sentence = sentence.lower()
    sentence = en_remove_stop_words(sentence)
    sentence = en_stem(sentence)
    return sentence


def preprocess_arabic(sentence):
    # Apply the Arabic preprocessing steps to the given sentence.
    sentence = normalize_arabic(sentence)
    sentence = ar_remove_stop_words(sentence)
    sentence = ar_stem(sentence)
    return sentence


def preprocess(query, lang):
    # Clean, strip punctuation, then run the language-specific pipeline.
    query = clean(query)
    query = remove_punctuation(query)
    if lang == "en":
        return preprocess_english(query)
    return preprocess_arabic(query)


def initailize_logger(logger, log_file, level):
    if not len(logger.handlers):  # avoid creating more than one handler
        formatter = logging.Formatter(
            "%(asctime)s %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
        )
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        logger.setLevel(level)
        logger.addHandler(file_handler)
        logger.addHandler(stream_handler)
    return logger