|
from typing import Tuple
|
|
import regex as re
|
|
import sys
|
|
from tqdm import tqdm
|
|
from .indic_num_map import INDIC_NUM_MAP
|
|
|
|
|
|
# Regex for URL-like spans: an optional http/https/ftp scheme, one or more
# dot-terminated labels (e.g. "www.", "example."), then a run of path/query
# characters. The lookarounds stop a match from starting right after a word
# character, "/" or ".", and from ending just before a ".<word>" suffix.
URL_PATTERN = r'\b(?<![\w/.])(?:(?:https?|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b'
|
|
# Regex for e-mail addresses: local part, "@", dot-separated domain labels and
# an alphabetic top-level domain of at least two letters. Inside a character
# class "|" is a literal pipe (not alternation), so the TLD class is written
# [A-Za-z]; otherwise "x@y.org|z" would be captured as a single address.
EMAIL_PATTERN = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
|
|
|
|
# Regex for numeric expressions. The alternatives are tried in this order:
#   1. two numbers followed by "%", with an optional "-" between them and an
#      optional "~" prefix on each, e.g. "10 - 20 %" or "~5%-~10%";
#   2. a single integer percentage, e.g. "~50%";
#   3. three digit groups joined by separators from - / . , : ' (the second
#      separator may also be "+"), with an optional decimal tail,
#      e.g. "12/05/2020";
#   4. two digit groups joined by one of - / . : ' +, with an optional decimal
#      tail, e.g. "10:30".
# The whole expression is a single capture group, so re.findall returns the
# full matched text for each hit.
NUMERAL_PATTERN = r"(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?|\d+[-\/.:'+]\d+(?:\.\d+)?)"
|
|
|
|
# Regex for tokens made of an optional ASCII alphanumeric prefix, one of the
# characters "#", "|" or "@", and then one or more word characters
# (presumably hashtags and @-mentions -- confirm against callers).
# NOTE(review): inside a character class "|" is a literal pipe, so "a|b" also
# matches; confirm whether that is intended.
OTHER_PATTERN = r'[A-Za-z0-9]*[#|@]\w+'
|
|
|
|
|
|
def normalize_indic_numerals(line: str):
    """
    Map native-script Indic digits in a string to their Roman-script counterparts.

    Every character of ``line`` is looked up in ``INDIC_NUM_MAP``; a character
    found there is replaced by its mapped value, any other character is kept
    unchanged.

    Args:
        line (str): text that may contain Indic numerals.

    Returns:
        str: ``line`` with every mapped character substituted.
    """
    converted = []
    for char in line:
        # Fall back to the character itself when it has no mapping.
        converted.append(INDIC_NUM_MAP.get(char, char))
    return "".join(converted)
|
|
|
|
|
|
def wrap_with_placeholders(text: str, patterns: list) -> Tuple[str, dict]:
    """
    Replace every span of ``text`` matching one of ``patterns`` with a numbered
    placeholder tag and record how to restore it.

    Each distinct matched string is assigned the tag ``<ID{n}>``, where ``n``
    counts up from 1 across all patterns, and every occurrence of that string
    in the text is replaced by the tag. Matches are numbered in the order they
    first appear in the text for each pattern, so the result is reproducible
    between runs. After all substitutions, runs of whitespace are collapsed to
    a single space and every ``>/`` sequence is reduced to ``>``.

    Args:
        text (str): an input string which needs to be wrapped with the placeholders.
        patterns (list): list of regex patterns to search for in the input
            string, applied in the given order.

    Returns:
        Tuple[str, dict]: a tuple containing the modified text and a dictionary
        mapping each placeholder (plus spaced and Indic-script spelling variants
        of it) to the original matched string.
    """
    # Alternative spellings of the "ID" tag name in Indic scripts (presumably the
    # forms the tag may come back in after translation -- confirm). The list is
    # loop-invariant, so it is built once per call rather than once per match.
    indic_failure_cases = ['آی ڈی ', 'ꯑꯥꯏꯗꯤ', 'आईडी', 'आई . डी . ', 'ऐटि', 'آئی ڈی ', 'ᱟᱭᱰᱤ ᱾', 'आयडी', 'ऐडि', 'आइडि']

    serial_no = 1
    placeholder_entity_map = {}

    for pattern in patterns:
        # De-duplicate while keeping first-occurrence order. Iterating a set
        # here would hand out serial numbers in a hash-dependent order, which
        # changes from one process to the next because str hashing is randomized.
        matches = dict.fromkeys(re.findall(pattern, text))

        for match in matches:
            if pattern == URL_PATTERN:
                # Skip URL matches with fewer than 4 characters besides dots.
                if len(match.replace(".", '')) < 4:
                    continue

            if pattern == NUMERAL_PATTERN:
                # Skip numeric matches with fewer than 4 characters once spaces,
                # dots and colons are removed.
                if len(match.replace(" ", '').replace(".", '').replace(":", '')) < 4:
                    continue

            placeholder = "<ID{}>".format(serial_no)
            alternate_placeholder = "< ID{} >".format(serial_no)
            placeholder_entity_map[placeholder] = match
            placeholder_entity_map[alternate_placeholder] = match

            # Register every spelling/spacing variant of the tag against the
            # same original string.
            for variant in indic_failure_cases:
                placeholder_entity_map["<{}{}>".format(variant, serial_no)] = match
                placeholder_entity_map["< {}{} >".format(variant, serial_no)] = match
                placeholder_entity_map["< {} {} >".format(variant, serial_no)] = match

            text = text.replace(match, placeholder)
            serial_no += 1

    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)

    # Drop a "/" that directly follows a ">" (presumably the trailing slash of a
    # URL, which URL_PATTERN's final \b leaves outside the match).
    text = text.replace(">/", ">")

    return text, placeholder_entity_map
|
|
|
|
|
|
def normalize(text: str, patterns: list = [EMAIL_PATTERN, URL_PATTERN, NUMERAL_PATTERN, OTHER_PATTERN]) -> Tuple[str, dict]:
    """
    Romanize Indic numerals in ``text`` and wrap pattern matches in placeholder tags.

    Leading and trailing newline characters are stripped first, then Indic
    numerals are converted to Roman script, and finally every span matching one
    of ``patterns`` is replaced by a placeholder tag.

    Args:
        text (str): input string.
        patterns (list): list of patterns to search for in the input string.

    Returns:
        Tuple[str, dict]: a tuple containing the modified text and a dictionary
        mapping placeholders to their original values.
    """
    without_edge_newlines = text.strip("\n")
    romanized = normalize_indic_numerals(without_edge_newlines)
    wrapped_text, placeholder_entity_map = wrap_with_placeholders(romanized, patterns)
    return wrapped_text, placeholder_entity_map
|
|
|