File size: 2,384 Bytes
f9d7028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# IMPORTANT NOTE: DO NOT DIRECTLY EDIT THIS FILE
# This file was manually ported from `normalize-punctuation.perl`
# TODO: Only supports English, add others

import regex as re
# NO-BREAK SPACE. Always written as an escape: a literal U+00A0 is visually
# identical to an ordinary space and is easily lost by editors and viewers.
_NBSP = "\u00a0"

multispace_regex = re.compile("[ ]{2,}")
multidots_regex = re.compile(r"\.{2,}")
end_bracket_space_punc_regex = re.compile(r"\) ([\.!:?;,])")
digit_space_percent = re.compile(r"(\d) %")
double_quot_punc = re.compile(r"\"([,\.]+)")
# Matches a no-break space sitting between two digits, e.g. "10<NBSP>000".
digit_nbsp_digit = re.compile(r"(\d)" + _NBSP + r"(\d)")

# Literal (old, new) substitutions applied one after another by `punc_norm`.
# ORDER MATTERS: later rows rely on the output of earlier ones (for example
# the single-quote rows must run before the "''" row).
_LITERAL_REPLACEMENTS = (
    ("\r", ""),
    # Pad every bracket with a space on the outside, then remove any space
    # on the inside; surplus spaces are collapsed later by multispace_regex.
    ("(", " ("),
    (")", ") "),
    ("( ", "("),
    (" )", ")"),
    (" :", ":"),
    (" ;", ";"),
    ("`", "'"),
    # Unicode quotation marks and dashes -> ASCII.
    ("\u201e", '"'),    # DOUBLE LOW-9 QUOTATION MARK
    ("\u201c", '"'),    # LEFT DOUBLE QUOTATION MARK
    ("\u201d", '"'),    # RIGHT DOUBLE QUOTATION MARK
    ("\u2013", "-"),    # EN DASH
    ("\u2014", " - "),  # EM DASH
    ("\u00b4", "'"),    # ACUTE ACCENT
    ("\u2018", "'"),    # LEFT SINGLE QUOTATION MARK
    ("\u201a", "'"),    # SINGLE LOW-9 QUOTATION MARK (looks like a comma)
    ("\u2019", "'"),    # RIGHT SINGLE QUOTATION MARK
    # Two single quotes -> one double quote. This also covers a doubled
    # acute accent, since each acute accent became "'" above.
    ("''", '"'),
    ("\u2026", "..."),  # HORIZONTAL ELLIPSIS
    # French quotes. A no-break space on the inner side of a guillemet is
    # first turned into an ordinary space so the rows below apply to it too.
    ("\u00ab" + _NBSP, "\u00ab "),
    (_NBSP + "\u00bb", " \u00bb"),
    (" \u00ab ", ' "'),
    ("\u00ab ", '"'),
    ("\u00ab", '"'),
    (" \u00bb ", '" '),
    (" \u00bb", '"'),
    ("\u00bb", '"'),
    # Pseudo-spaces: drop a no-break space in front of punctuation, and
    # turn it into an ordinary space next to units / after a comma.
    (_NBSP + "%", "%"),
    ("n\u00ba" + _NBSP, "n\u00ba "),  # \u00ba = MASCULINE ORDINAL INDICATOR
    (_NBSP + ":", ":"),
    (_NBSP + "\u00baC", " \u00baC"),
    (_NBSP + "cm", " cm"),
    (_NBSP + "?", "?"),
    (_NBSP + "!", "!"),
    (_NBSP + ";", ";"),
    ("," + _NBSP, ", "),
)


def punc_norm(text: str, lang: str = "en") -> str:
    """Normalize punctuation and spacing in *text*.

    Steps, in order:

    1. Apply the literal substitutions in ``_LITERAL_REPLACEMENTS``
       (bracket spacing, Unicode quotes/dashes to ASCII, French quotes,
       no-break "pseudo-spaces").
    2. Collapse runs of two or more spaces into one space.
    3. Collapse runs of two or more dots into a single dot (so an ellipsis,
       which step 1 expands to "...", ends up as ".").
    4. Remove the space between a closing bracket and ``. ! : ? ; ,``.
    5. Remove the space between a digit and ``%``.
    6. Move commas/periods that directly follow a double quote to before it.
    7. Replace a no-break space between two digits with ".".
    8. Strip leading and trailing spaces (only U+0020, not other whitespace).

    Args:
        text: The string to normalize.
        lang: Language code. Accepted for interface compatibility but
            currently unused; the English rules are always applied.

    Returns:
        The normalized string.
    """
    for old, new in _LITERAL_REPLACEMENTS:
        text = text.replace(old, new)

    text = multispace_regex.sub(" ", text)
    text = multidots_regex.sub(".", text)
    text = end_bracket_space_punc_regex.sub(r")\1", text)
    text = digit_space_percent.sub(r"\1%", text)
    # English style: "quotation," -- the comma/period goes inside the quote.
    text = double_quot_punc.sub(r'\1"', text)
    # A no-break space between digits (e.g. "10<NBSP>000") is rewritten as
    # ".", which appears to mirror the default (non de/es/cs/fr) branch of
    # normalize-punctuation.perl -- confirm against the upstream script.
    # Ordinary spaces between digits are left alone.
    # NOTE: matches cannot overlap, so in "1<NBSP>2<NBSP>3" only the first
    # no-break space is rewritten.
    text = digit_nbsp_digit.sub(r"\1.\2", text)
    return text.strip(" ")