import re
# Punctuation and symbols (Latin and Persian) to strip from transcriptions.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
    ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
]
# Escape each character so "-" and other metacharacters are matched literally
# instead of being interpreted as a range inside the character class.
chars_to_ignore = f"""[{"".join(map(re.escape, chars_to_ignore))}]"""
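# Illustrative check (not in the original script): the class should match any
# single ignored character, e.g. re.sub(chars_to_ignore, "", "a،b!c") == "abc".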
# Invisible/formatting characters that frequently appear in Persian text;
# each one is replaced with a plain space.
dictionary_mapping = {
    "\u200c": " ",  # zero-width non-joiner (ZWNJ)
    "\u200d": " ",  # zero-width joiner (ZWJ)
    "\u200e": " ",  # left-to-right mark (LRM)
    "\u200f": " ",  # right-to-left mark (RLM)
    "\ufeff": " ",  # byte order mark (BOM)
    "\u0307": " ",  # combining dot above
}
def multiple_replace(text, chars_to_mapping):
    """Replace every key of chars_to_mapping found in text with its value."""
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))
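# Example (illustrative, not in the original script):
#   multiple_replace("سلام\u200cدنیا", dictionary_mapping) -> "سلام دنیا"
# The ZWNJ between the two words becomes a plain space.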
def remove_special_characters(text, chars_to_ignore_regex):
    """Strip ignored characters, lowercase, and pad with a trailing space."""
    text = re.sub(chars_to_ignore_regex, "", text).lower() + " "
    return text
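# Example (illustrative, not in the original script):
#   remove_special_characters("Hello, World!", chars_to_ignore) -> "hello world "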
def normalizer_at_word_level(text):
    """Per-word normalization hook; currently a pass-through."""
    words = text.split()
    _text = []
    for word in words:
        # Word-level rules can be plugged in here; for now each word is kept as-is.
        _text.append(word)
    return " ".join(_text) + " "
def normalizer(batch, return_dict=True, filter_trivials=False, remove_extra_space=False):
    """Normalize batch["sentence"]: map invisible characters to spaces, strip
    ignored punctuation, apply word-level rules, and collapse repeated spaces."""
    text = batch["sentence"].lower().strip()

    # Replace invisible/formatting characters with spaces
    text = multiple_replace(text, dictionary_mapping)
    text = re.sub(" +", " ", text)

    # Remove punctuation and special symbols
    text = remove_special_characters(text, chars_to_ignore)
    text = re.sub(" +", " ", text)

    # Apply word-level normalization
    text = normalizer_at_word_level(text)
    text = re.sub(" +", " ", text)

    if remove_extra_space:
        text = text.strip()
    else:
        text = text.strip() + " "

    # Drop trivially short transcriptions when requested
    if filter_trivials and len(text) <= 2:
        text = None

    if not return_dict:
        return text

    batch["sentence"] = text
    return batch
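# A minimal usage sketch (assumed, not part of the original script). It runs the
# normalizer on a hand-made record; the Persian sentence is illustrative only.
# In a training pipeline this would typically be applied over a dataset split
# with datasets.Dataset.map(normalizer) (an assumption about the surrounding code).
if __name__ == "__main__":
    sample = {"sentence": "سلام، دنیای «بزرگ»!  "}
    print(normalizer(sample))  # punctuation stripped, whitespace collapsed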