model_trans_indictrans2 / IndicTrans2 /inference /normalize_regex_inference.py

Upload 79 files

f9d7028 verified 12 days ago

4.72 kB

	from typing import Tuple
	import regex as re
	import sys
	from tqdm import tqdm
	from .indic_num_map import INDIC_NUM_MAP


	URL_PATTERN = r'\b(?<![\w/.])(?:(?:https?\|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b'
	EMAIL_PATTERN = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z\|a-z]{2,}'
	# handles dates, time, percentages, proportion, ratio, etc
	NUMERAL_PATTERN = r"(~?\d+\.?\d\s?%?\s?-?\s?~?\d+\.?\d\s?%\|~?\d+%\|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?\|\d+[-\/.:'+]\d+(?:\.\d+)?)"
	# handles upi, social media handles and hashtags
	OTHER_PATTERN = r'[A-Za-z0-9]*[#\|@]\w+'


	def normalize_indic_numerals(line: str):
	"""
	Normalize the numerals in Indic languages from native script to Roman script (if present).

	Args:
	line (str): an input string with Indic numerals to be normalized.

	Returns:
	str: an input string with the all Indic numerals normalized to Roman script.
	"""
	return "".join([INDIC_NUM_MAP.get(c, c) for c in line])


	def wrap_with_placeholders(text: str, patterns: list) -> Tuple[str, dict]:
	"""
	Wraps substrings with matched patterns in the given text with placeholders and returns
	the modified text along with a mapping of the placeholders to their original value.

	Args:
	text (str): an input string which needs to be wrapped with the placeholders.
	pattern (list): list of patterns to search for in the input string.

	Returns:
	Tuple[str, dict]: a tuple containing the modified text and a dictionary mapping
	placeholders to their original values.
	"""
	serial_no = 1

	placeholder_entity_map = dict()

	for pattern in patterns:
	matches = set(re.findall(pattern, text))

	# wrap common match with placeholder tags
	for match in matches:
	if pattern==URL_PATTERN :
	#Avoids false positive URL matches for names with initials.
	temp = match.replace(".",'')
	if len(temp)<4:
	continue
	if pattern==NUMERAL_PATTERN :
	#Short numeral patterns do not need placeholder based handling.
	temp = match.replace(" ",'').replace(".",'').replace(":",'')
	if len(temp)<4:
	continue

	#Set of Translations of "ID" in all the suppported languages have been collated.
	#This has been added to deal with edge cases where placeholders might get translated.
	indic_failure_cases = ['آی ڈی ', 'ꯑꯥꯏꯗꯤ', 'आईडी', 'आई . डी . ', 'ऐटि', 'آئی ڈی ', 'ᱟᱭᱰᱤ ᱾', 'आयडी', 'ऐडि', 'आइडि']
	placeholder = "<ID{}>".format(serial_no)
	alternate_placeholder = "< ID{} >".format(serial_no)
	placeholder_entity_map[placeholder] = match
	placeholder_entity_map[alternate_placeholder] = match

	for i in indic_failure_cases:
	placeholder_temp = "<{}{}>".format(i,serial_no)
	placeholder_entity_map[placeholder_temp] = match
	placeholder_temp = "< {}{} >".format(i, serial_no)
	placeholder_entity_map[placeholder_temp] = match
	placeholder_temp = "< {} {} >".format(i, serial_no)
	placeholder_entity_map[placeholder_temp] = match

	text = text.replace(match, placeholder)
	serial_no+=1

	text = re.sub("\s+", " ", text)

	#Regex has failure cases in trailing "/" in URLs, so this is a workaround.
	text = text.replace(">/",">")

	return text, placeholder_entity_map


	def normalize(text: str, patterns: list = [EMAIL_PATTERN, URL_PATTERN, NUMERAL_PATTERN, OTHER_PATTERN]) -> Tuple[str, dict]:
	"""
	Normalizes and wraps the spans of input string with placeholder tags. It first normalizes
	the Indic numerals in the input string to Roman script. Later, it uses the input string with normalized
	Indic numerals to wrap the spans of text matching the pattern with placeholder tags.

	Args:
	text (str): input string.
	pattern (list): list of patterns to search for in the input string.

	Returns:
	Tuple[str, dict]: a tuple containing the modified text and a dictionary mapping
	placeholders to their original values.
	"""
	text = normalize_indic_numerals(text.strip("\n"))
	text, placeholder_entity_map = wrap_with_placeholders(text, patterns)
	return text, placeholder_entity_map