import re
import string

import nltk

# Fetch the NLTK stop-word corpus. This runs every time the module is
# imported, before the Arabic list below is read.
nltk.download('stopwords')

# Arabic stop words held in a set so the membership test in
# remove_stopwords() is a hash lookup rather than a list scan.
arabic_stopwords = set(nltk.corpus.stopwords.words("arabic"))


# Matches a single Arabic diacritic (tashkeel) or the tatweel elongation
# mark. The characters are spelled as \u escapes so the pattern cannot be
# damaged by a file-encoding round trip.
arabic_diacritics = re.compile(r"""
    \u0651 |  # Tashdid
    \u064E |  # Fatha
    \u064B |  # Tanwin Fath
    \u064F |  # Damma
    \u064C |  # Tanwin Damm
    \u0650 |  # Kasra
    \u064D |  # Tanwin Kasr
    \u0652 |  # Sukun
    \u0640    # Tatwil/Kashida
""", re.VERBOSE)


# Punctuation seen in Arabic text, written with \u escapes for the non-ASCII
# members so the literal survives encoding round trips:
#   U+00F7 division sign, U+00D7 multiplication sign, U+061B Arabic semicolon,
#   U+0640 tatweel, U+060C Arabic comma, U+061F Arabic question mark,
#   U+00A6 broken bar, U+201D/U+201C curly double quotes,
#   U+2026 horizontal ellipsis, U+2013 en dash.
arabic_punctuations = (
    "`\u00f7\u00d7\u061b<>_()*&^%][\u0640\u060c/:\"\u061f.,'{}~\u00a6+|!"
    "\u201d\u2026\u201c\u2013\u0640"
)

english_punctuations = string.punctuation

# Every character treated as punctuation by cleaning_content().
punctuations = arabic_punctuations + english_punctuations


def remove_urls(text):
    """Delete URLs from *text*.

    A URL is an optional ``http``/``https`` scheme, the literal ``://`` and a
    run of URL characters. The match must end on a word boundary, so
    punctuation that follows a URL (for example a sentence-final period) is
    left in place.

    Args:
        text: String to clean.

    Returns:
        ``text`` with each matched URL replaced by the empty string.
    """
    # '-', ':', '#', '~' and '+' are part of the character set so hyphenated
    # hosts, ports and fragments are removed whole instead of leaving a tail
    # such as "-site.com" behind.
    return re.sub(r'(?:https?)?://[\w./?=&%#~:+-]*\b', '', text)


def remove_emails(text):
    """Delete e-mail addresses from *text*.

    The pattern is unanchored, so an address is removed wherever it occurs,
    not only when it is the sole content of a line. The domain part is a
    sequence of dot-separated labels, so a period that merely ends the
    sentence after an address is not consumed.

    Args:
        text: String to clean.

    Returns:
        ``text`` with each matched address replaced by the empty string.
    """
    return re.sub(
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+",
        "",
        text,
    )


# Characters stripped by remove_emoji(). Compiled once at import time.
# The BMP entries are listed individually: a single wide range over them
# would also cover CJK, Hangul and the Arabic presentation forms, and delete
# ordinary text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # every supplementary-plane character (all emoji blocks above the BMP)
    "\u2500-\u2BEF"          # box drawing through misc symbols, dingbats and arrows
    "\u200d"                 # zero width joiner (glues emoji sequences)
    "\u231a"                 # watch
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward
    "\u24c2"                 # circled latin capital letter M
    "\ufe0f"                 # variation selector-16 (emoji presentation)
    "\u3030"                 # wavy dash
    "\u303d"                 # part alternation mark
    "\u3297"                 # circled ideograph congratulation
    "\u3299"                 # circled ideograph secret
    "]+"
)


def remove_emoji(data):
    """Strip emoji and pictographic symbols from *data*.

    Args:
        data: String to clean.

    Returns:
        ``data`` with every run of characters matched by ``_EMOJI_PATTERN``
        removed. Other text, including Arabic presentation forms and CJK,
        is returned untouched.
    """
    return _EMOJI_PATTERN.sub('', data)


# Single-character folding table used by normalization(). No replacement
# character is itself a key, so one translate pass gives the same result as
# applying the rules one after another.
# NOTE(review): the characters are written as \u escapes because the literal
# forms were mis-encoded; confirm the yeh / heh / kaf targets against the
# intended normalisation scheme.
_ARABIC_NORMALIZATION_TABLE = str.maketrans({
    "\u0625": "\u0627",  # alef with hamza below -> alef
    "\u0623": "\u0627",  # alef with hamza above -> alef
    "\u0622": "\u0627",  # alef with madda       -> alef
    "\u0649": "\u064A",  # alef maksura          -> yeh
    "\u0624": "\u0621",  # waw with hamza        -> hamza
    "\u0626": "\u0621",  # yeh with hamza        -> hamza
    "\u0629": "\u0647",  # teh marbuta           -> heh
    "\u06AF": "\u0643",  # gaf                   -> kaf
})


def normalization(text):
    """Fold orthographic variants of Arabic letters to one canonical letter.

    Args:
        text: String to normalise.

    Returns:
        ``text`` with alef variants, alef maksura, hamza carriers, teh
        marbuta and gaf replaced as listed in
        ``_ARABIC_NORMALIZATION_TABLE``; all other characters are unchanged.
    """
    return text.translate(_ARABIC_NORMALIZATION_TABLE)


def remove_diacritics(text):
    """Return *text* with every match of ``arabic_diacritics`` deleted.

    Args:
        text: String to clean.

    Returns:
        The input with all characters matched by the module-level
        ``arabic_diacritics`` pattern removed.
    """
    return arabic_diacritics.sub('', text)


def remove_stopwords(text):
    """Drop whitespace-delimited tokens that appear in ``arabic_stopwords``.

    Args:
        text: String to filter.

    Returns:
        The surviving tokens joined by single spaces. Because the text is
        split on whitespace and re-joined, runs of whitespace collapse to one
        space and leading/trailing whitespace disappears.
    """
    kept = [word for word in text.split() if word not in arabic_stopwords]
    return ' '.join(kept)


# Retweet marker as a standalone word, so "RT" inside a longer Latin word
# (e.g. "ARTICLE") is not cut out.
_RETWEET_MARKER = re.compile(r'\bRT\b')

# Placeholders and markup removed verbatim before punctuation handling.
_DROPPED_LITERALS = ('<LF>', '<br />', '"', '<url>', 'USERID')


def cleaning_content(line):
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order: flatten newlines; remove e-mail addresses, URLs and
    emoji; drop tokens containing ``@``; remove the retweet marker and the
    literals in ``_DROPPED_LITERALS``; turn punctuation into spaces; remove
    stop words; normalise letters; strip diacritics; trim.

    Args:
        line: Text to clean. Non-string values (for example a float NaN
            standing in for a missing cell) are accepted.

    Returns:
        The cleaned string, or ``None`` when ``line`` is not a ``str``.
    """
    if not isinstance(line, str):
        return None

    # str.replace returns a new string; the result has to be kept.
    line = line.replace('\n', ' ')
    line = remove_emails(line)
    line = remove_urls(line)
    line = remove_emoji(line)

    # Mentions and any other token carrying '@' are dropped outright.
    line = ' '.join(token for token in line.split() if '@' not in token)

    line = _RETWEET_MARKER.sub('', line)
    for literal in _DROPPED_LITERALS:
        line = line.replace(literal, '')

    # Punctuation becomes a space, so words on either side of a mark stay
    # separate; the extra whitespace is collapsed by remove_stopwords().
    line = line.translate(str.maketrans({char: ' ' for char in punctuations}))

    line = remove_stopwords(line)
    line = remove_diacritics(normalization(line))

    return line.strip()


def hasDigits(s):
    """Report whether *s* contains a digit.

    Args:
        s: String to inspect.

    Returns:
        ``True`` if any character is an ASCII digit (``0``-``9``) or an
        Arabic-Indic digit (U+0660-U+0669); ``False`` otherwise, including
        for the empty string.
    """
    return any(
        '0' <= char <= '9' or '\u0660' <= char <= '\u0669'
        for char in s
    )