|
|
|
|
|
|
|
|
|
import regex as re
|
|
# The no-break space is spelled as an escape everywhere below: as a literal it
# is visually indistinguishable from an ASCII space and is easily lost when
# the file passes through editors or copy/paste.
_NBSP = "\u00a0"

# Two or more consecutive ASCII spaces.
multispace_regex = re.compile(r"[ ]{2,}")

# Two or more consecutive dots.
multidots_regex = re.compile(r"\.{2,}")

# A closing bracket, one space, then a punctuation mark (captured).
end_bracket_space_punc_regex = re.compile(r"\) ([\.!:?;,])")

# A digit (captured), one ASCII space, then a percent sign.
digit_space_percent = re.compile(r"(\d) %")

# A double quote followed by a run of commas/periods (captured).
double_quot_punc = re.compile(r"\"([,\.]+)")

# Two digits (both captured) separated by a no-break space.
digit_nbsp_digit = re.compile(r"(\d)" + _NBSP + r"(\d)")

# Literal (old, new) substitutions applied by punc_norm, strictly in this
# order. Order matters: later rules see the output of earlier ones (e.g. the
# curly single quotes are turned into "'" before "''" is turned into '"').
_PUNC_REPLACEMENTS = (
    ("\r", ""),
    # Put a space outside every bracket, then drop the spaces inside them.
    ("(", " ("),
    (")", ") "),
    ("( ", "("),
    (" )", ")"),
    (" :", ":"),
    (" ;", ";"),
    # Map typographic quotes, dashes and the ellipsis to ASCII.
    ("`", "'"),
    ("„", '"'),
    ("“", '"'),
    ("”", '"'),
    ("–", "-"),
    ("—", " - "),
    ("´", "'"),
    ("‘", "'"),
    ("‚", "'"),
    ("’", "'"),
    ("''", '"'),
    ("´´", '"'),
    ("…", "..."),
    # Guillemets, with or without their surrounding spaces.
    (" « ", ' "'),
    ("« ", '"'),
    ("«", '"'),
    (" » ", '" '),
    (" »", '"'),
    ("»", '"'),
    # No-break ("pseudo") spaces around punctuation and units.
    (_NBSP + "%", "%"),
    ("nº" + _NBSP, "nº "),
    (_NBSP + ":", ":"),
    (_NBSP + "ºC", " ºC"),
    (_NBSP + "cm", " cm"),
    (_NBSP + "?", "?"),
    (_NBSP + "!", "!"),
    (_NBSP + ";", ";"),
    ("," + _NBSP, ", "),
)


def punc_norm(text: str, lang: str = "en") -> str:
    """Normalize punctuation and spacing in *text*.

    The literal substitutions in ``_PUNC_REPLACEMENTS`` are applied first, in
    order. Then runs of spaces collapse to one space, runs of dots collapse
    to one dot, the space between ``)`` and a following punctuation mark is
    removed, ``<digit> %`` becomes ``<digit>%``, commas/periods that follow a
    double quote are moved in front of it, and a no-break space between two
    digits becomes ``.``. Finally leading and trailing ASCII spaces are
    stripped (other whitespace, such as newlines, is kept).

    NOTE(review): the pseudo-space rules are written here with an explicit
    U+00A0. The literals they replace looked like plain spaces, which made
    several rules no-ops and made the digit rule rewrite "1 2" as "1.2";
    confirm U+00A0 is what the upstream rules intend.

    Args:
        text: The string to normalize.
        lang: Language code. Accepted for interface compatibility; the
            function body does not currently read it.

    Returns:
        The normalized string.
    """
    for old, new in _PUNC_REPLACEMENTS:
        text = text.replace(old, new)

    text = multispace_regex.sub(" ", text)
    text = multidots_regex.sub(".", text)
    text = end_bracket_space_punc_regex.sub(r")\1", text)
    text = digit_space_percent.sub(r"\1%", text)
    text = double_quot_punc.sub(r'\1"', text)
    text = digit_nbsp_digit.sub(r"\1.\2", text)
    return text.strip(" ")