import logging
import re
import string

import arabicstopwords.arabicstopwords as ar_stp
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from snowballstemmer import stemmer


# Stemmers are created once at module load and reused by the helpers below.
ar_stemmer = stemmer("arabic")
porter = PorterStemmer()


def read_file(input_file, sep="\t", names=None):
    """Read a dataset into a DataFrame from an .xlsx file or a delimited text file."""
    if input_file.endswith(".xlsx"):
        df = pd.read_excel(input_file)
    elif names:
        df = pd.read_csv(input_file, sep=sep, names=names, encoding="utf-8")
    else:
        df = pd.read_csv(input_file, sep=sep, encoding="utf-8")
    return df
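
# Usage sketch (illustrative only; the file name and column names are
# hypothetical): df = read_file("queries.tsv", names=["qid", "query"])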


def remove_punctuation(text):
    """Remove every character that is neither a word character nor whitespace."""
    return re.sub(r"[^\w\s]", "", text)


def normalize_arabic(text):
    """Normalize common Arabic orthographic variants to a single canonical form."""
    text = re.sub("[إأٱآا]", "ا", text)  # unify alef variants
    text = re.sub("ى", "ي", text)  # alef maqsura -> ya
    text = re.sub("ؤ", "ء", text)  # hamza on waw -> bare hamza
    text = re.sub("ئ", "ء", text)  # hamza on ya -> bare hamza
    text = re.sub("ة", "ه", text)  # ta marbuta -> ha
    return text


def remove_punctuations_tashkeel(text):
    """Remove punctuation marks and Arabic diacritics (tashkeel) from an Arabic string."""
    punctuations = """`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ""" + string.punctuation

    arabic_diacritics = re.compile(
        """
        ّ | # Shadda
        َ | # Fatha
        ً | # Tanwin Fath
        ُ | # Damma
        ٌ | # Tanwin Damm
        ِ | # Kasra
        ٍ | # Tanwin Kasr
        ْ | # Sukun
        ـ  # Tatwil/Kashida
        """,
        re.VERBOSE,
    )

    text = text.translate(str.maketrans("", "", punctuations))
    text = arabic_diacritics.sub("", text)
    return text
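
# Illustration: remove_punctuations_tashkeel("مَرْحَبًا!") returns "مرحبا",
# with both the diacritics and the exclamation mark stripped.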


def remove_longation(text):
    """Normalize Arabic letter variants (a near-duplicate of normalize_arabic)
    and additionally map the gaf (گ) to kaf (ك)."""
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)
    return text


def remove_harakaat(text):
    """Strip harakaat (diacritics) and tatweel, then keep only tokens made of
    Arabic letters and digits."""
    accents = re.compile(r"[\u064b-\u0652\u0640]")  # tashkeel range plus tatweel
    arabic_or_digit = re.compile(r"[\u0621-\u063A\u0641-\u064A\d]+")
    text = " ".join(arabic_or_digit.findall(accents.sub("", text)))
    return text.strip()


def ar_remove_stop_words(sentence):
    """Remove Arabic stopwords using the arabicstopwords package's list."""
    stop_words = set(ar_stp.stopwords_list())
    return " ".join(term for term in sentence.split() if term not in stop_words)


def ar_stem(sentence):
    """Stem each whitespace-separated token with the Snowball Arabic stemmer."""
    return " ".join(ar_stemmer.stemWord(word) for word in sentence.split())


def en_remove_stop_words(sentence):
    """Remove English stopwords using NLTK's stopword list."""
    stop_words = set(stopwords.words("english"))
    return " ".join(term for term in sentence.split() if term not in stop_words)


def en_stem(sentence):
    """Stem each token with the Porter stemmer."""
    return " ".join(porter.stem(word) for word in word_tokenize(sentence))


def clean(text):
    """Clean input text of URLs, retweet markers, @handles, stray punctuation,
    tabs, line jumps, extra whitespace, and emojis/smileys."""
    text = re.sub(r"http\S+", " ", text)      # URLs
    text = re.sub(r"\bRT\b", " ", text)       # retweet marker
    text = re.sub(r"@\w*", " ", text)         # @handles
    text = re.sub(r"[.,#_|:?/=]", " ", text)  # assorted punctuation
    text = re.sub(r"[\t\n]", " ", text)       # tabs and line jumps
    text = re.sub(r"\s+", " ", text)          # collapse whitespace
    text = text.strip()
    return remove_emoji_smileys(text)


def remove_emoji_smileys(text):
    """Remove common emoji ranges and ASCII smileys from text."""
    try:
        # Wide-build pattern: match emoji code points directly.
        emojis_pattern = re.compile(
            "([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])"
        )
    except re.error:
        # Fallback for narrow (UCS-2) builds, where the code points above are
        # only representable as surrogate pairs.
        emojis_pattern = re.compile(
            "([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|([\uD83D][\uDE80-\uDEFF])"
        )

    smileys_pattern = re.compile(
        r"(\s?:X|:|;|=)(?:-)?(?:\)+|\(|O|D|P|S|\\|\/\s){1,}", re.IGNORECASE
    )

    text = smileys_pattern.sub("", text)
    text = emojis_pattern.sub("", text)
    return text


def preprocess_english(sentence):
    """Lowercase, remove English stopwords, then Porter-stem the sentence."""
    sentence = sentence.lower()
    sentence = en_remove_stop_words(sentence)
    return en_stem(sentence)


def preprocess_arabic(sentence):
    """Normalize Arabic orthography, remove stopwords, then stem the sentence."""
    sentence = normalize_arabic(sentence)
    sentence = ar_remove_stop_words(sentence)
    return ar_stem(sentence)


def preprocess(query, lang):
    """Clean and preprocess a query; any lang other than "en" is treated as Arabic."""
    query = clean(query)
    query = remove_punctuation(query)
    if lang == "en":
        return preprocess_english(query)
    return preprocess_arabic(query)


def initailize_logger(logger, log_file, level):
    """Attach file and stream handlers to the logger (only once) and set its level."""
    if not logger.handlers:
        formatter = logging.Formatter(
            "%(asctime)s %(levelname)s: %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
        )
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(formatter)
        stream_handler = logging.StreamHandler()
        stream_handler.setFormatter(formatter)
        logger.setLevel(level)
        logger.addHandler(file_handler)
        logger.addHandler(stream_handler)
    return logger
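

# Minimal usage sketch. It assumes the NLTK "stopwords" and "punkt" resources
# have already been downloaded (e.g. nltk.download("stopwords"),
# nltk.download("punkt")); the sample strings are illustrative only.
if __name__ == "__main__":
    print(preprocess("RT @user Check out https://example.com today! :)", "en"))
    print(preprocess("الكتاب على الطاولة", "ar"))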