File size: 2,384 Bytes
f9d7028 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
# IMPORTANT NOTE: DO NOT DIRECTLY EDIT THIS FILE
# This file was manually ported from `normalize-punctuation.perl`
# TODO: Only English is supported so far; add support for other languages.
import regex as re
# Non-breaking space (U+00A0), spelled as an escape so that it cannot be
# confused with -- or silently converted to -- a regular space by editors,
# web viewers or copy/paste.
_NBSP = "\u00a0"

multispace_regex = re.compile("[ ]{2,}")
multidots_regex = re.compile(r"\.{2,}")
end_bracket_space_punc_regex = re.compile(r"\) ([\.!:?;,])")
digit_space_percent = re.compile(r"(\d) %")
double_quot_punc = re.compile(r"\"([,\.]+)")
# Two digits separated by a non-breaking space (NOT a regular space).
digit_nbsp_digit = re.compile(r"(\d)" + _NBSP + r"(\d)")

# Literal (old, new) substitutions, applied strictly in this order.
# The order matters: later entries rely on the output of earlier ones
# (e.g. "''" is collapsed only after the various apostrophes became "'").
_LITERAL_REPLACEMENTS = (
    ("\r", ""),
    # Put spaces outside brackets, remove them inside.
    ("(", " ("),
    (")", ") "),
    ("( ", "("),
    (" )", ")"),
    (" :", ":"),
    (" ;", ";"),
    # Normalize typographic quotes, dashes and ellipsis to ASCII.
    ("`", "'"),
    ("„", '"'),
    ("“", '"'),
    ("”", '"'),
    ("–", "-"),
    ("—", " - "),
    ("´", "'"),
    ("‘", "'"),
    ("‚", "'"),
    ("’", "'"),
    ("''", '"'),
    ("´´", '"'),
    ("…", "..."),
    # Guillemets.
    (" « ", ' "'),
    ("« ", '"'),
    ("«", '"'),
    (" » ", '" '),
    (" »", '"'),
    ("»", '"'),
    # Pseudo-spaces: drop a non-breaking space next to punctuation, or turn
    # it into a regular space next to units / "nº".
    (_NBSP + "%", "%"),
    ("nº" + _NBSP, "nº "),
    (_NBSP + ":", ":"),
    (_NBSP + "ºC", " ºC"),
    (_NBSP + "cm", " cm"),
    (_NBSP + "?", "?"),
    (_NBSP + "!", "!"),
    (_NBSP + ";", ";"),
    ("," + _NBSP, ", "),
)


def punc_norm(text: str, lang: str = "en") -> str:
    """Normalize punctuation and spacing in *text*.

    Applies, in order: the literal substitutions in
    ``_LITERAL_REPLACEMENTS``, collapsing of runs of spaces to one space,
    collapsing of runs of dots to a single dot, removal of the space between
    a closing bracket and following punctuation, removal of the space between
    a digit and ``%``, moving commas/periods that directly follow a double
    quote to before it, and replacing a non-breaking space between two digits
    with ``.``. Leading and trailing regular spaces are stripped.

    Args:
        text: The string to normalize.
        lang: Accepted for interface compatibility; currently ignored, the
            same rules are applied whatever value is passed.

    Returns:
        The normalized string.
    """
    for old, new in _LITERAL_REPLACEMENTS:
        text = text.replace(old, new)

    text = multispace_regex.sub(" ", text)
    text = multidots_regex.sub(".", text)
    text = end_bracket_space_punc_regex.sub(r")\1", text)
    text = digit_space_percent.sub(r"\1%", text)
    # English "quotation," followed by comma, style
    text = double_quot_punc.sub(r'\1"', text)
    # A non-breaking space between two digits becomes '.'; digits separated
    # by a regular space are left alone.
    # NOTE(review): presumably mirrors the digit-group handling of the Perl
    # script this file was ported from -- confirm against that script.
    text = digit_nbsp_digit.sub(r"\1.\2", text)
    return text.strip(" ")