Spaces:
Sleeping
Sleeping
from modules.utils.zh_normalization.text_normlization import * | |
# Single-character punctuation replacements, applied by apply_character_map
# through str.maketrans / str.translate. Every key must therefore be exactly
# one character long.
#
# The literal previously listed ":", ";", "!", "(", ")" and "-" twice. In a
# dict literal the later entry silently wins, so the first group was dead
# (for instance "!" -> "。" never took effect). That first group sits among
# the CJK brackets, so it is taken to be the full-width forms. Those keys are
# written as escapes so they cannot be collapsed to their ASCII look-alikes.
# NOTE(review): confirm the full-width keys against upstream, in particular
# the full-width hyphen (U+FF0D).
character_map = {
    # Full-width / CJK punctuation.
    "\uff1a": ",",  # FULLWIDTH COLON
    "\uff1b": ",",  # FULLWIDTH SEMICOLON
    "\uff01": "。",  # FULLWIDTH EXCLAMATION MARK
    "\uff08": ",",  # FULLWIDTH LEFT PARENTHESIS
    "\uff09": ",",  # FULLWIDTH RIGHT PARENTHESIS
    "【": ",",
    "】": ",",
    "『": ",",
    "』": ",",
    "「": ",",
    "」": ",",
    "《": ",",
    "》": ",",
    "\uff0d": ",",  # FULLWIDTH HYPHEN-MINUS
    "‘": " ",
    "“": " ",
    "’": " ",
    "”": " ",
    # ASCII punctuation.
    ":": ",",
    ";": ",",
    "!": ".",
    "(": ",",
    ")": ",",
    # "[" and "]" are left unmapped; text_normalize treats "[...]" spans as
    # tags and passes them through, presumably why these stay disabled.
    # '[': ',',
    # ']': ',',
    ">": ",",
    "<": ",",
    "-": ",",
}
# Substrings that are replaced with a spoken word by apply_character_to_word.
# Keys are matched literally, including their surrounding spaces.
character_to_word = {" & ": " and "}
def apply_character_to_word(text):
    """Replace every substring listed in ``character_to_word`` with its word.

    The replacements run one after another in the mapping's iteration order,
    each on the result of the previous one, and the final string is returned.
    """
    for symbol in character_to_word:
        text = text.replace(symbol, character_to_word[symbol])
    return text
def apply_character_map(text):
    """Return ``text`` with each character in ``character_map`` substituted.

    The translation table is built from ``character_map`` on every call, so
    the mapping's current contents are always the ones applied.
    """
    return text.translate(str.maketrans(character_map))
def insert_spaces_between_uppercase(s):
    """Insert a space at each letter boundary that involves an uppercase letter.

    A space is added between:
      * two adjacent uppercase ASCII letters,
      * a lowercase ASCII letter followed by an uppercase one,
      * a character in U+4E00..U+9FA5 followed by an uppercase letter,
      * an uppercase letter followed by a character in U+4E00..U+9FA5.

    All four alternatives are zero-width, so no original character is
    removed or altered.
    """
    boundary = re.compile(
        r"(?<=[A-Z])(?=[A-Z])|(?<=[a-z])(?=[A-Z])|(?<=[\u4e00-\u9fa5])(?=[A-Z])|(?<=[A-Z])(?=[\u4e00-\u9fa5])"
    )
    return boundary.sub(" ", s)
def ensure_suffix(a: str, b: str, c: str):
    """Strip ``a`` and append ``c`` unless the stripped text ends with ``b``.

    Args:
        a: Text to check; surrounding whitespace is removed first.
        b: Suffix whose presence leaves the stripped text as it is.
        c: Text appended when ``b`` is not already the suffix.

    Returns:
        The stripped text, with ``c`` appended if it did not end with ``b``.
    """
    stripped = a.strip()
    return stripped if stripped.endswith(b) else stripped + c
# Spoken replacements for well-known e-mail domains, keyed by the lower-case
# domain. The replacement stands in for the whole domain, TLD included.
email_domain_map = {
    "outlook.com": "Out look",
    "hotmail.com": "Hot mail",
    "yahoo.com": "雅虎",
}


def email_detect(text):
    """Rewrite every e-mail address in ``text`` into a form that reads aloud.

    For each address the local part is spelled out character by character
    (separated by single spaces) and "@" becomes " at ". A domain listed in
    ``email_domain_map`` is replaced by its spoken form; any "." left in the
    domain afterwards becomes " dot ". Text outside of addresses is returned
    untouched.

    Examples:
        zhzluke96@outlook.com => z h z l u k e 9 6 at Out look
        a.b@example.org       => a . b at example dot org

    Args:
        text: Input string that may contain e-mail addresses.

    Returns:
        ``text`` with every matched address replaced.
    """
    email_pattern = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")

    def replace(match):
        # Neither character class of the pattern admits "@", so the match
        # holds exactly one "@" and the split yields two parts.
        name, domain = match.group(1).split("@")
        spelled_name = " ".join(name)
        # Domain names are case-insensitive, so look them up in lower case;
        # otherwise "Outlook.COM" would miss its entry.
        spoken_domain = email_domain_map.get(domain.lower())
        if spoken_domain is None:
            spoken_domain = domain
        spoken_domain = spoken_domain.replace(".", " dot ")
        return f"{spelled_name} at {spoken_domain}"

    return email_pattern.sub(replace, text)
def pre_normalize(text):
    """Hook applied to a text part before the main normalizer runs.

    Currently a pass-through: ``text`` is returned unchanged.

    NOTE: the e-mail rewriting step (``text = email_detect(text)``) is left
    disabled here because its results were only mediocre.
    """
    return text
def post_normalize(text):
    """Apply the clean-up steps that follow the main normalizer.

    The steps run in a fixed order, each on the previous step's output:
    space insertion around uppercase letters, single-character punctuation
    mapping, then symbol-to-word replacement.
    """
    pipeline = (
        insert_spaces_between_uppercase,
        apply_character_map,
        apply_character_to_word,
    )
    for step in pipeline:
        text = step(text)
    return text
def text_normalize(text, is_end=False):
    """Normalize ``text`` while passing ``[...]`` tags through unnormalized.

    The input is scanned for bracketed tags ("[" up to the nearest "]") and
    for runs of text that contain no "[". A tag is emitted as is, with one
    space added on each side. Any other run goes through ``pre_normalize``,
    ``TextNormalizer.normalize`` and ``post_normalize`` (per returned
    sentence), and the results are concatenated.

    Args:
        text: The string to normalize.
        is_end: Currently unused; the suffix handling it would enable is
            disabled (see the note at the end of the body).

    Returns:
        The normalized string.
    """
    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    normalizer = TextNormalizer()
    # Group 1 captures a "[...]" tag, group 2 a run without any "[".
    tag_or_text = re.compile(r"(\[.+?\])|([^[]+)")

    def normalize_part(part):
        sentences = normalizer.normalize(pre_normalize(part))
        return "".join(post_normalize(sentence) for sentence in sentences)

    def replace(match):
        tag = match.group(1)
        if not tag:
            return normalize_part(match.group(2))
        return f" {tag} "

    # NOTE: the suffix step below is disabled because it introduced noise.
    # It was meant to keep the last words from being swallowed:
    # if is_end:
    #     result = ensure_suffix(result, "[uv_break]", "。。。[uv_break]。。。")
    return tag_or_text.sub(replace, text)
if __name__ == "__main__":
    # Manual smoke test: print the normalized form of a few sample inputs
    # (mixed Chinese/English text, text with "[...]" tags, a percentage).
    demo_inputs = (
        "ChatTTS是专门为对话场景设计的文本转语音模型,例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.",
        " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
        " 明天有62%的概率降雨",
    )
    for demo_input in demo_inputs:
        print(text_normalize(demo_input))