RealKintaro's picture
Init
7f9da02
import re
import string
import nltk
# Import-time side effect: make sure the NLTK "stopwords" corpus is available
# (nltk.download fetches it when it is not already installed locally).
# This must run before the corpus is read on the next line.
nltk.download('stopwords')
# Arabic stop words held in a set so membership tests in remove_stopwords()
# are constant-time.
arabic_stopwords = set(nltk.corpus.stopwords.words("arabic"))
# Matches one Arabic diacritic mark (tashkeel) or the tatweel elongation
# character.  The code points are spelled as \uXXXX escapes (interpreted by
# the regex engine) so the pattern cannot be corrupted by the file being
# saved or read with the wrong text encoding.
arabic_diacritics = re.compile(r"""
    \u0651 |  # Tashdid
    \u064E |  # Fatha
    \u064B |  # Tanwin Fath
    \u064F |  # Damma
    \u064C |  # Tanwin Damm
    \u0650 |  # Kasra
    \u064D |  # Tanwin Kasr
    \u0652 |  # Sukun
    \u0640    # Tatwil/Kashida
""", re.VERBOSE)
# Arabic and typographic punctuation.  Non-ASCII characters are written as
# escapes so they survive any re-encoding of this file:
#   U+00F7 division, U+00D7 multiplication, U+061B Arabic semicolon,
#   U+0640 tatweel, U+060C Arabic comma, U+061F Arabic question mark,
#   U+00A6 broken bar, U+201D/U+201C curly double quotes,
#   U+2026 ellipsis, U+2013 en dash.
arabic_punctuations = (
    "`\u00F7\u00D7\u061B<>_()*&^%][\u0640\u060C/:\"\u061F.,'{}~\u00A6+|!"
    "\u201D\u2026\u201C\u2013\u0640"
)
english_punctuations = string.punctuation
# Every character that the cleaning step treats as punctuation.
punctuations = arabic_punctuations + english_punctuations
# "://" with an optional http/https scheme in front, followed by any run of
# characters that commonly appear in a URL.  Besides word characters and
# "./?=&%", the set includes "-", "#", "~", ":", "+" and "@" so hyphenated
# hosts, ports and fragments are consumed as part of the URL.  The trailing
# \b makes the match end on a word character, so sentence punctuation that
# directly follows a URL is left in the text.
_URL_PATTERN = re.compile(r'(?:https?)?://[\w.\-/?=&%#~:+@]*\b')


def remove_urls(text):
    """Return *text* with every URL-like substring deleted.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* in which each match of the URL pattern has been
        replaced by the empty string; surrounding whitespace is kept.
    """
    return _URL_PATTERN.sub('', text)
# local-part "@" domain "." rest.  The pattern is deliberately unanchored so
# that an address embedded in a sentence is matched, not only an address that
# occupies a whole line by itself.
_EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+")


def remove_emails(text):
    """Return *text* with every e-mail address deleted.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* in which each address has been replaced by the
        empty string; surrounding whitespace and line breaks are kept.
    """
    return _EMAIL_PATTERN.sub("", text)
# def remove_emoji(text):
# return emoji.get_emoji_regexp().sub(u'', text)
# One character class, applied to runs of characters:
#   * U+200D zero-width joiner, U+231A watch, U+23CF eject, U+23E9 fast-forward
#   * every code point from U+24C2 up to U+10FFFF
# The emoji blocks (emoticons, pictographs, transport symbols, flags,
# dingbats, variation selector U+FE0F, wavy dash U+3030, ...) all lie inside
# that open-ended upper range.
# NOTE(review): the upper range is much wider than emoji -- it also covers
# CJK, Hangul and the Arabic Presentation Forms (U+FB50-U+FDFF,
# U+FE70-U+FEFF), so those characters are stripped too.  Confirm that this
# is intended before relying on it for text that may contain them.
_EMOJI_PATTERN = re.compile(
    "[\u200d\u231a\u23cf\u23e9\u24c2-\U0010ffff]+",
    re.UNICODE,
)


def remove_emoji(data):
    """Return *data* with emoji and other high code-point symbols deleted.

    Args:
        data: The string to clean.

    Returns:
        A copy of *data* without any character matched by the pattern above.
    """
    return _EMOJI_PATTERN.sub('', data)
# Letter-level normalisation rules.  Code points are escaped so the table
# cannot be corrupted by a wrong file encoding.  No replacement character is
# itself a key, so a single translate pass gives the same result as applying
# the rules one after another.
_NORMALIZATION_TABLE = str.maketrans({
    "\u0625": "\u0627",  # alef with hamza below -> bare alef
    "\u0623": "\u0627",  # alef with hamza above -> bare alef
    "\u0622": "\u0627",  # alef with madda      -> bare alef
    "\u0649": "\u064A",  # alef maksura         -> yeh
    "\u0624": "\u0621",  # waw with hamza       -> hamza
    "\u0626": "\u0621",  # yeh with hamza       -> hamza
    "\u0629": "\u0647",  # teh marbuta          -> heh
    "\u06AF": "\u0643",  # gaf                  -> kaf
})


def normalization(text):
    """Return *text* with Arabic letter variants folded to a canonical form.

    Args:
        text: The string to normalise.

    Returns:
        A copy of *text* with the alef, yeh, hamza-carrier, teh marbuta and
        gaf variants replaced as listed in the table above; every other
        character is left unchanged.
    """
    return text.translate(_NORMALIZATION_TABLE)
def remove_diacritics(text):
    """Return *text* with every match of ``arabic_diacritics`` deleted."""
    return arabic_diacritics.sub('', text)
def remove_stopwords(text):
    """Drop Arabic stop words from *text*.

    The text is split on whitespace, tokens found in ``arabic_stopwords`` are
    discarded, and the remaining tokens are joined with single spaces (so any
    run of whitespace in the input collapses to one space).
    """
    return ' '.join(
        token for token in text.split() if token not in arabic_stopwords
    )
def cleaning_content(line):
    """Run the full cleaning pipeline on one piece of text.

    Steps, in order: flatten newlines, remove e-mail addresses, URLs and
    emoji, drop tokens containing '@', strip a few literal markup markers,
    replace punctuation with whitespace, remove Arabic stop words, normalise
    letter variants and remove diacritics.

    Args:
        line: The text to clean.  ``None`` or a float is treated as a
            missing value (presumably NaN cells coming from a data frame --
            confirm against the caller).

    Returns:
        The cleaned, stripped string, or ``None`` for a missing value.
    """
    if line is None or isinstance(line, float):
        return None

    # str.replace returns a new string; the result has to be kept.
    line = line.replace('\n', ' ')
    line = remove_emails(line)
    line = remove_urls(line)
    line = remove_emoji(line)

    # Any token containing '@' becomes the placeholder 'USERID', which is
    # removed together with the other markers just below.
    line = ' '.join('USERID' if '@' in word else word for word in line.split())
    for marker in ('RT', '<LF>', '<br />', '&quot;', '<url>', 'USERID'):
        line = line.replace(marker, '')

    # Turn each punctuation character into a space so the words on either
    # side stay separate; remove_stopwords() re-joins tokens with single
    # spaces, so extra whitespace introduced here does not survive.
    line = line.translate(str.maketrans(dict.fromkeys(punctuations, ' ')))

    # Stop words are removed before normalisation, i.e. they are compared in
    # their original spelling.
    line = remove_stopwords(line)
    line = remove_diacritics(normalization(line))
    return line.strip()
def hasDigits(s):
    """Return True if *s* contains an ASCII or Arabic-Indic digit.

    ASCII digits are code points 48-57 ('0'-'9'); Arabic-Indic digits are
    code points 1632-1641 (U+0660-U+0669).
    """
    ascii_digits = range(48, 58)
    arabic_indic_digits = range(1632, 1642)
    for char in s:
        code = ord(char)
        if code in ascii_digits or code in arabic_indic_digits:
            return True
    return False