Spaces:
Sleeping
Sleeping
File size: 6,378 Bytes
01e655b 84cfd61 01e655b 84cfd61 01e655b 84cfd61 01e655b 84cfd61 01e655b 84cfd61 01e655b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 |
from modules.utils.zh_normalization.text_normlization import *
import emojiswitch
from modules.utils.markdown import markdown_to_text
# Ordered list of text -> text callables applied to every normalized sentence.
# Filled by the @post_normalize() decorator, consumed by apply_post_normalize().
post_normalize_pipeline = []
# Ordered list of text -> text callables applied to the raw input text.
# Filled by the @pre_normalize() decorator, consumed by apply_pre_normalize().
pre_normalize_pipeline = []
def post_normalize():
    """Return a decorator that registers a post-normalization step.

    The decorated function is appended to ``post_normalize_pipeline`` (so
    steps run in definition order) and is returned unchanged, which keeps it
    directly callable under its own name.
    """

    def register(step):
        post_normalize_pipeline.append(step)
        return step

    return register
def pre_normalize():
    """Return a decorator that registers a pre-normalization step.

    The decorated function is appended to ``pre_normalize_pipeline`` (so
    steps run in definition order) and is returned unchanged, which keeps it
    directly callable under its own name.
    """

    def register(step):
        pre_normalize_pipeline.append(step)
        return step

    return register
def apply_pre_normalize(text):
    """Run ``text`` through every registered pre-normalization step.

    Steps are applied in registration order; each step receives the output
    of the previous one. Returns the result of the last step, or ``text``
    itself when no step is registered.
    """
    current = text
    for step in pre_normalize_pipeline:
        current = step(current)
    return current
def apply_post_normalize(text):
    """Run ``text`` through every registered post-normalization step.

    Steps are applied in registration order; each step receives the output
    of the previous one. Returns the result of the last step, or ``text``
    itself when no step is registered.
    """
    current = text
    for step in post_normalize_pipeline:
        current = step(current)
    return current
def is_markdown(text):
    """Heuristically report whether ``text`` contains Markdown syntax.

    The text is searched (in ``re.MULTILINE`` mode) for a handful of common
    Markdown constructs; the first hit is enough.

    Args:
        text: The string to inspect.

    Returns:
        ``True`` if any pattern matches, ``False`` otherwise (including for
        the empty string).
    """
    markdown_patterns = (
        # Heading of any level. ``#+`` (rather than a single ``#``) is needed
        # so that "## Title" / "### Title" are recognised as well.
        r"(^|\s)#+[^#]",
        r"\*\*.*?\*\*",  # bold
        r"\*.*?\*",  # italic
        r"!\[.*?\]\(.*?\)",  # image
        r"\[.*?\]\(.*?\)",  # link
        r"`[^`]+`",  # inline code
        r"```[\s\S]*?```",  # fenced code block
        r"(^|\s)\* ",  # unordered list item
        r"(^|\s)\d+\. ",  # ordered list item
        r"(^|\s)> ",  # block quote
        r"(^|\s)---",  # horizontal rule
    )
    return any(
        re.search(pattern, text, re.MULTILINE) for pattern in markdown_patterns
    )
# Single-character punctuation replacements, applied in one pass through
# str.maketrans()/str.translate(); every key must therefore be exactly one
# character long.
#
# NOTE(review): the first group listed the ASCII characters ":", ";", "!",
# "(", ")" and "-" a second time, which made those entries dead (the later
# duplicate key wins in a dict literal). They sit among CJK brackets, so they
# are written here as the full-width forms - confirm against upstream.
character_map = {
    # Full-width / CJK punctuation.
    "：": ",",
    "；": ",",
    "！": "。",
    "（": ",",
    "）": ",",
    "【": ",",
    "】": ",",
    "『": ",",
    "』": ",",
    "「": ",",
    "」": ",",
    "《": ",",
    "》": ",",
    "－": ",",
    # Quotation marks are replaced by a space.
    "‘": " ",
    "“": " ",
    "’": " ",
    "”": " ",
    '"': " ",
    "'": " ",
    # ASCII punctuation.
    ":": ",",
    ";": ",",
    "!": ".",
    "(": ",",
    ")": ",",
    "[": ",",
    "]": ",",
    ">": ",",
    "<": ",",
    "-": ",",
}
# Substring -> spoken-word replacements used by apply_character_to_word().
# The key includes its surrounding spaces, so only a free-standing "&" is
# replaced.
character_to_word = {
    " & ": " and ",
}
@post_normalize()
def apply_character_to_word(text):
    """Replace every key of ``character_to_word`` found in ``text`` by its value.

    Replacements are applied one after another in the table's iteration
    order; the resulting string is returned.
    """
    result = text
    for symbol in character_to_word:
        result = result.replace(symbol, character_to_word[symbol])
    return result
@post_normalize()
def apply_character_map(text):
    """Translate single characters of ``text`` according to ``character_map``.

    The translation table is rebuilt from ``character_map`` on every call,
    so later changes to that dict are picked up.
    """
    return text.translate(str.maketrans(character_map))
@post_normalize()
def apply_emoji_map(text):
    """Pass ``text`` through ``emojiswitch.demojize`` with ``lang="zh"``.

    Empty delimiters are supplied, so nothing is wrapped around whatever
    ``demojize`` substitutes (presumably the Chinese name of each emoji -
    confirm against the emojiswitch documentation).
    """
    no_delimiters = ("", "")
    return emojiswitch.demojize(text, delimiters=no_delimiters, lang="zh")
@post_normalize()
def insert_spaces_between_uppercase(s):
    """Insert a space at uppercase-letter boundaries in ``s``.

    A single space is inserted at every position that lies between:
      * two adjacent uppercase ASCII letters,
      * a lowercase ASCII letter followed by an uppercase one,
      * a character in U+4E00..U+9FA5 followed by an uppercase letter,
      * an uppercase letter followed by a character in U+4E00..U+9FA5.
    """
    # Zero-width alternatives only: each one is a lookbehind/lookahead pair,
    # so the substitution adds a space without consuming any character.
    boundary = r"(?<=[A-Z])(?=[A-Z])|(?<=[a-z])(?=[A-Z])|(?<=[\u4e00-\u9fa5])(?=[A-Z])|(?<=[A-Z])(?=[\u4e00-\u9fa5])"
    return re.sub(boundary, " ", s)
@pre_normalize()
def apply_markdown_to_text(text):
    """Convert ``text`` with ``markdown_to_text`` when it looks like Markdown.

    Text for which ``is_markdown`` returns a falsy value is returned
    unchanged.
    """
    if not is_markdown(text):
        return text
    return markdown_to_text(text)
# Put every quoted span on a line of its own:
#   "xxx" => \n"xxx"\n
#   'xxx' => \n'xxx'\n
# (the quote characters themselves are kept inside the span).
@pre_normalize()
def replace_quotes(text):
    """Surround each quoted span in ``text`` with newlines.

    Handles ASCII double quotes, ASCII single quotes and the curly pairs
    “ ” and ‘ ’. A span is an opening quote, one or more characters that are
    neither quote of the pair, and the closing quote.

    Args:
        text: Input text.

    Returns:
        ``text`` with ``"\\n"`` inserted before and after every quoted span.
    """
    repl = r"\n\1\n"
    quote_pairs = [
        ('"', '"'),
        ("'", "'"),
        ("“", "”"),
        ("‘", "’"),
    ]
    for opening, closing in quote_pairs:
        if opening == "'":
            # An ASCII apostrophe inside a word ("don't", "it's") is not a
            # quotation mark: require that the opening quote does not follow
            # an ASCII letter and the closing quote is not followed by one.
            lead, trail = r"(?<![A-Za-z])", r"(?![A-Za-z])"
        else:
            lead = trail = ""
        text = re.sub(
            rf"({lead}{opening}[^{opening}{closing}]+?{closing}{trail})",
            repl,
            text,
        )
    return text
def ensure_suffix(a: str, b: str, c: str):
    """Strip ``a`` and append ``c`` unless the stripped text ends with ``b``.

    Args:
        a: Text to check; leading and trailing whitespace is removed first.
        b: Suffix to look for.
        c: Text appended when ``b`` is not already the suffix.

    Returns:
        The stripped text, with ``c`` appended if it did not end with ``b``.
    """
    trimmed = a.strip()
    return trimmed if trimmed.endswith(b) else trimmed + c
# Spoken replacements for well-known e-mail domains. Keys are lowercase,
# complete domains; the value replaces the whole domain (including ".com").
email_domain_map = {
    "outlook.com": "Out look",
    "hotmail.com": "Hot mail",
    "yahoo.com": "雅虎",
}


# Find every e-mail address and make it speakable: the local part is split
# into single characters, "@" becomes " at ", and the domain is either
# replaced by its entry in email_domain_map or has each "." spelled " dot ".
#
# Examples:
#   zhzluke96@outlook.com => z h z l u k e 9 6 at Out look
#   a.b@example.com       => a . b at example dot com
def email_detect(text):
    """Rewrite every e-mail address in ``text`` into a speakable form.

    Args:
        text: Input text, possibly containing e-mail addresses.

    Returns:
        ``text`` with each address replaced by ``"<spelled name> at <domain>"``;
        text without addresses is returned unchanged.
    """
    email_pattern = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")

    def replace(match):
        # The pattern admits exactly one "@", so the split yields two parts.
        name, domain = match.group(1).split("@")
        spelled_name = " ".join(name)
        # Domains are case-insensitive: look them up lowercased so that
        # "Outlook.COM" hits the same entry as "outlook.com".
        spoken_domain = email_domain_map.get(domain.lower(), domain)
        spoken_domain = spoken_domain.replace(".", " dot ")
        return f"{spelled_name} at {spoken_domain}"

    return email_pattern.sub(replace, text)
def sentence_normalize(sentence_text: str):
    """Normalize one line of text while leaving bracketed tags intact.

    The line is split into bracketed tags (e.g. ``[laugh_0]``) and the runs
    of text between them. Tags are kept verbatim and padded with one space
    on each side; every other run is normalized with ``TextNormalizer`` and
    each resulting sentence is passed through ``apply_post_normalize``.

    Args:
        sentence_text: A single line of input text.

    Returns:
        The normalized line.
    """
    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    normalizer = TextNormalizer()
    # Group 1: a "[...]" tag; group 2: a run of characters containing no "[".
    token_pattern = re.compile(r"(\[.+?\])|([^[]+)")

    def normalize_part(part):
        return "".join(
            apply_post_normalize(sentence) for sentence in normalizer.normalize(part)
        )

    def replace(match):
        tag = match.group(1)
        if tag:
            return f" {tag} "
        return normalize_part(match.group(2))

    # NOTE: appending a "。。。[uv_break]。。。" suffix (via ensure_suffix) at
    # the end of the text was tried to keep the last words from being
    # swallowed, but it is left disabled because it introduced audible noise.
    return token_pattern.sub(replace, sentence_text)
def text_normalize(text, is_end=False):
    """Normalize a whole text, line by line.

    The text first goes through ``apply_pre_normalize`` and is then split on
    newlines. Each line is stripped, blank lines are dropped, and the
    remaining lines are normalized with ``sentence_normalize``.

    Args:
        text: The raw input text.
        is_end: Accepted for callers that pass it; not used by this function.

    Returns:
        The normalized lines joined with ``"\\n"``.
    """
    prepared = apply_pre_normalize(text)
    normalized_lines = []
    for raw_line in prepared.split("\n"):
        stripped = raw_line.strip()
        if stripped:
            normalized_lines.append(sentence_normalize(stripped))
    return "\n".join(normalized_lines)
if __name__ == "__main__":
    # Manual smoke run: print the normalized form of a few representative
    # inputs (mixed Chinese/English, control tags, percentages, emoji,
    # Markdown and quoted dialogue).
    samples = [
        "ChatTTS是专门为对话场景设计的文本转语音模型,例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.",
        " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
        " 明天有62%的概率降雨",
        "大🍌,一条大🍌,嘿,你的感觉真的很奇妙 [lbreak]",
        """
# 你好,世界
```js
console.log('1')
```
**加粗**
*一条文本*
""",
        """
在沙漠、岩石、雪地上行走了很长的时间以后,小王子终于发现了一条大路。所有的大路都是通往人住的地方的。
“你们好。”小王子说。
这是一个玫瑰盛开的花园。
“你好。”玫瑰花说道。
小王子瞅着这些花,它们全都和他的那朵花一样。
“你们是什么花?”小王子惊奇地问。
“我们是玫瑰花。”花儿们说道。
“啊!”小王子说……。
""",
    ]
    for index, sample in enumerate(samples):
        normalized = text_normalize(sample, is_end=True)
        # The dict wrapper makes print() show the repr, so newlines and
        # spaces in the result stay visible.
        print(f"case {index}:\n", {"x": normalized})
|