"""Normalization helpers for Chinese text: numerals and punctuation."""

import re

import cn2an

from style_bert_vits2.nlp.symbols import PUNCTUATIONS


# Maps each source punctuation string to the symbol that replaces it.
__REPLACE_MAP = {
    ":": ",",
    ";": ",",
    ",": ",",
    "。": ".",
    "!": "!",
    "?": "?",
    "\n": ".",
    "·": ",",
    "、": ",",
    "...": "…",
    "$": ".",
    "“": "'",
    "”": "'",
    '"': "'",
    "‘": "'",
    "’": "'",
    "(": "'",
    ")": "'",
    "(": "'",
    ")": "'",
    "《": "'",
    "》": "'",
    "【": "'",
    "】": "'",
    "[": "'",
    "]": "'",
    "—": "-",
    "~": "-",
    "~": "-",
    "「": "'",
    "」": "'",
}

# Integer or decimal literal written with ASCII digits.
__NUMBER_PATTERN = re.compile(r"\d+(?:\.?\d+)?")

# Alternation over every key of __REPLACE_MAP. Longer keys come first so that
# a multi-character key (e.g. "...") can never be shadowed by a shorter key
# that happens to be its prefix.
__REPLACE_PATTERN = re.compile(
    "|".join(
        re.escape(key) for key in sorted(__REPLACE_MAP, key=len, reverse=True)
    )
)

# Matches every run of characters that is neither in the range U+4E00-U+9FA5
# nor one of PUNCTUATIONS. Each symbol is escaped so that characters with a
# special meaning inside a character class ("-", "]", "^", "\\") are always
# taken literally, regardless of their position in PUNCTUATIONS.
__UNSUPPORTED_PATTERN = re.compile(
    r"[^\u4e00-\u9fa5" + "".join(re.escape(p) for p in PUNCTUATIONS) + r"]+"
)


def normalize_text(text: str) -> str:
    """Normalize text by converting numbers and cleaning up punctuation.

    Every integer or decimal number written with digits is passed to
    ``cn2an.an2cn`` and replaced by its result; the text is then run through
    :func:`replace_punctuation`.

    Args:
        text: The text to normalize.

    Returns:
        The normalized text.
    """
    # A single left-to-right substitution pass replaces each match exactly
    # where it was found.
    text = __NUMBER_PATTERN.sub(lambda m: cn2an.an2cn(m.group()), text)
    return replace_punctuation(text)


def replace_punctuation(text: str) -> str:
    """Map punctuation to the supported set and drop everything else.

    The characters "嗯" and "呣" are first rewritten to "恩" and "母". Every
    key of ``__REPLACE_MAP`` found in the text is then replaced by its mapped
    value. Finally, any character that is neither in the range U+4E00-U+9FA5
    nor listed in ``PUNCTUATIONS`` is removed.

    Args:
        text: The text to clean up.

    Returns:
        The text containing only characters in U+4E00-U+9FA5 and symbols from
        ``PUNCTUATIONS``.
    """
    text = text.replace("嗯", "恩").replace("呣", "母")
    replaced_text = __REPLACE_PATTERN.sub(
        lambda m: __REPLACE_MAP[m.group()], text
    )
    return __UNSUPPORTED_PATTERN.sub("", replaced_text)