# Origin: model_trans_indictrans2 / IndicTrans2 / inference / normalize_punctuation.py
# (Hugging Face file-viewer residue, kept for provenance: uploaded by GEETHANAYAGI,
# commit "Upload 79 files", f9d7028, verified, 2.38 kB.)
# IMPORTANT NOTE: DO NOT DIRECTLY EDIT THIS FILE
# This file was manually ported from `normalize-punctuation.perl`
# TODO: Only supports English, add others
import regex as re
# U+00A0 NO-BREAK SPACE, spelled as an escape on purpose: a literal NBSP is
# indistinguishable from a normal space in most editors and is easily lost
# when the file is copied, which silently turns the pseudo-space rules below
# into no-ops (or into rules that fire on ordinary spaces).
_NBSP = "\u00A0"

multispace_regex = re.compile("[ ]{2,}")
multidots_regex = re.compile(r"\.{2,}")
end_bracket_space_punc_regex = re.compile(r"\) ([\.!:?;,])")
digit_space_percent = re.compile(r"(\d) %")
double_quot_punc = re.compile(r"\"([,\.]+)")
# Digit, NBSP, digit -- must NOT match an ordinary space, otherwise two
# space-separated numbers ("10 20") would be fused into "10.20".
digit_nbsp_digit = re.compile(r"(\d)" + _NBSP + r"(\d)")

# Literal (old, new) substitutions applied strictly in this order; later rules
# operate on the output of earlier ones, so do not reorder.
_LITERAL_REPLACEMENTS = (
    ("\r", ""),
    # Pad brackets on the outside, then remove padding on the inside.
    ("(", " ("),
    (")", ") "),
    ("( ", "("),
    (" )", ")"),
    # No space before colon / semicolon.
    (" :", ":"),
    (" ;", ";"),
    ("`", "'"),
    # Typographic quotes, dashes and ellipsis -> ASCII.
    ("„", '"'),
    ("“", '"'),
    ("”", '"'),
    ("–", "-"),
    ("—", " - "),
    ("´", "'"),
    ("‘", "'"),
    ("‚", "'"),
    ("’", "'"),
    ("''", '"'),
    # Unreachable after the "´" rule above; kept to mirror the original rule list.
    ("´´", '"'),
    ("…", "..."),
    # French quotes (guillemets).
    (" « ", ' "'),
    ("« ", '"'),
    ("«", '"'),
    (" » ", '" '),
    (" »", '"'),
    ("»", '"'),
    # Pseudo-spaces: drop or normalise a no-break space next to punctuation/units.
    (_NBSP + "%", "%"),
    ("nº" + _NBSP, "nº "),
    (_NBSP + ":", ":"),
    (_NBSP + "ºC", " ºC"),
    (_NBSP + "cm", " cm"),
    (_NBSP + "?", "?"),
    (_NBSP + "!", "!"),
    (_NBSP + ";", ";"),
    ("," + _NBSP, ", "),
)


def punc_norm(text: str, lang: str = "en") -> str:
    """Normalize punctuation in *text* and return the cleaned string.

    The steps are, in order:

    1. apply every literal substitution in ``_LITERAL_REPLACEMENTS``
       (carriage returns, bracket spacing, typographic quotes/dashes,
       guillemets, no-break-space handling);
    2. collapse runs of two or more spaces into one;
    3. collapse runs of two or more dots into a single dot (this also
       shortens the ``...`` produced from ``…`` in step 1);
    4. remove the space between ``)`` and a following punctuation mark;
    5. remove the space between a digit and ``%``;
    6. move commas/periods that follow a double quote in front of it
       (English ``"quotation,"`` style);
    7. replace a no-break space between two digits with ``.``;
    8. strip leading and trailing spaces (spaces only, not other whitespace).

    Args:
        text: The string to normalize.
        lang: Language code. Currently ignored -- only the English rule set
            is implemented; the parameter is kept for interface compatibility.

    Returns:
        The normalized string.
    """
    for old, new in _LITERAL_REPLACEMENTS:
        text = text.replace(old, new)

    text = multispace_regex.sub(" ", text)
    text = multidots_regex.sub(".", text)
    text = end_bracket_space_punc_regex.sub(r")\1", text)
    text = digit_space_percent.sub(r"\1%", text)
    # English "quotation," followed by comma, style
    text = double_quot_punc.sub(r'\1"', text)
    # A no-break space between digits is treated as a digit-group separator
    # and written as "." (the behaviour the original port implements).
    text = digit_nbsp_digit.sub(r"\1.\2", text)
    return text.strip(" ")