File size: 4,724 Bytes
f9d7028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import sys
from typing import Optional, Tuple

import regex as re
from tqdm import tqdm

from .indic_num_map import INDIC_NUM_MAP


# Matches web addresses with or without an explicit http(s)/ftp scheme: one or
# more dot-terminated labels followed by a run of path/query characters. The
# leading lookbehind rejects candidates glued to a preceding word character,
# "/" or ".".
URL_PATTERN = r'\b(?<![\w/.])(?:(?:https?|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b'
# Matches e-mail addresses: local part, "@", domain, and an alphabetic TLD of
# at least two letters. The TLD class is [A-Za-z]; a "|" inside a character
# class is a literal pipe, not alternation, so it must not appear there.
EMAIL_PATTERN = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
# Handles dates, times, percentages, proportions, ratios, etc.
NUMERAL_PATTERN = r"(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?|\d+[-\/.:'+]\d+(?:\.\d+)?)"
# Handles UPI IDs, social media handles and hashtags: an optional alphanumeric
# prefix, then "#" or "@", then one or more word characters.
OTHER_PATTERN = r'[A-Za-z0-9]*[#@]\w+'


def normalize_indic_numerals(line: str):
    """
    Rewrite native-script Indic digits in a string as Roman-script digits.

    Every character of the input is looked up in ``INDIC_NUM_MAP``; a character
    with an entry is replaced by the mapped value, and any other character is
    kept exactly as it is.

    Args:
        line (str): input string that may contain Indic numerals.

    Returns:
        str: the input string with each mapped character substituted.
    """
    lookup = INDIC_NUM_MAP.get
    return "".join(lookup(symbol, symbol) for symbol in line)


def wrap_with_placeholders(text: str, patterns: list) -> Tuple[str, dict]:
    """
    Replace spans of ``text`` that match any of ``patterns`` with ``<IDn>`` placeholders.

    Patterns are applied in the order given. Within one pattern, distinct
    matches are numbered in order of first occurrence, so the same input always
    yields the same placeholders. Every occurrence of a match is replaced by
    the same placeholder. After all substitutions, runs of whitespace are
    collapsed to a single space and a "/" directly following a ">" is dropped.

    Args:
        text (str): an input string which needs to be wrapped with the placeholders.
        patterns (list): list of regex patterns to search for in the input string.
            Matches are collected with ``re.findall``, so a pattern containing a
            capturing group yields the group's text rather than the whole match.

    Returns:
        Tuple[str, dict]: a tuple containing the modified text and a dictionary
            mapping each placeholder (plus its spaced and translated-"ID"
            variants) to the original matched text.
    """
    # Renderings of "ID" collated for the supported languages. A placeholder
    # may come back with "ID" translated, so those variants are registered in
    # the map as well. Built once here because it does not depend on the match.
    indic_failure_cases = (
        'آی ڈی ',
        'ꯑꯥꯏꯗꯤ',
        'आईडी',
        'आई . डी . ',
        'ऐटि',
        'آئی ڈی ',
        'ᱟᱭᱰᱤ ᱾',
        'आयडी',
        'ऐडि',
        'आइडि',
    )

    serial_no = 1
    placeholder_entity_map = {}

    for pattern in patterns:
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        # A set would iterate strings in a hash-dependent order that can
        # change between interpreter runs, making the numbering unstable.
        matches = dict.fromkeys(re.findall(pattern, text))

        for match in matches:
            if not match:
                # str.replace with an empty needle inserts the replacement
                # between every character, so empty matches are ignored.
                continue
            if pattern == URL_PATTERN:
                # Avoids false positive URL matches for names with initials.
                temp = match.replace(".", '')
                if len(temp) < 4:
                    continue
            if pattern == NUMERAL_PATTERN:
                # Short numeral patterns do not need placeholder based handling.
                temp = match.replace(" ", '').replace(".", '').replace(":", '')
                if len(temp) < 4:
                    continue

            placeholder = "<ID{}>".format(serial_no)
            alternate_placeholder = "< ID{} >".format(serial_no)
            placeholder_entity_map[placeholder] = match
            placeholder_entity_map[alternate_placeholder] = match

            # Register the translated-"ID" spellings in the three spacing
            # layouts handled by the original implementation.
            for i in indic_failure_cases:
                placeholder_entity_map["<{}{}>".format(i, serial_no)] = match
                placeholder_entity_map["< {}{} >".format(i, serial_no)] = match
                placeholder_entity_map["< {} {} >".format(i, serial_no)] = match

            text = text.replace(match, placeholder)
            serial_no += 1

    # Raw string: "\s" is not a valid Python string escape.
    text = re.sub(r"\s+", " ", text)

    # Regex has failure cases in trailing "/" in URLs, so this is a workaround.
    text = text.replace(">/", ">")

    return text, placeholder_entity_map


def normalize(text: str, patterns: Optional[list] = None) -> Tuple[str, dict]:
    """
    Normalize Indic numerals in ``text`` and wrap pattern matches with placeholder tags.

    Leading and trailing newline characters are stripped first. The Indic
    numerals are then normalized via ``normalize_indic_numerals``, and the
    result is passed to ``wrap_with_placeholders`` together with ``patterns``.

    Args:
        text (str): input string.
        patterns (list, optional): list of regex patterns to search for in the
            input string. When omitted or ``None``, the e-mail, URL, numeral
            and handle/hashtag patterns of this module are used, in that order.

    Returns:
        Tuple[str, dict]: a tuple containing the modified text and a dictionary
            mapping placeholders to their original values.
    """
    if patterns is None:
        # Built per call rather than as a default argument, so no list object
        # is shared between invocations.
        patterns = [EMAIL_PATTERN, URL_PATTERN, NUMERAL_PATTERN, OTHER_PATTERN]

    text = normalize_indic_numerals(text.strip("\n"))
    text, placeholder_entity_map = wrap_with_placeholders(text, patterns)
    return text, placeholder_entity_map