File size: 9,168 Bytes
d2b7e94 f34bda5 d2b7e94 01e655b d2b7e94 02e90e4 d2b7e94 02e90e4 ebc4336 02e90e4 f34bda5 02e90e4 01e655b f34bda5 01e655b 84cfd61 01e655b 84cfd61 01e655b 02e90e4 1df74c6 01e655b 02e90e4 01e655b f34bda5 02e90e4 01e655b 02e90e4 ebc4336 02e90e4 d8e7d56 f34bda5 d8e7d56 02e90e4 84cfd61 01e655b f34bda5 01e655b 84cfd61 01e655b 1df74c6 01e655b 84cfd61 02e90e4 b473486 1df74c6 b473486 01e655b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 |
import re
from functools import lru_cache
import emojiswitch
from modules import models
from modules.utils.markdown import markdown_to_text
from modules.utils.zh_normalization.text_normlization import *
# Whether to disable the unk-token check.
# NOTE: used in unit tests to skip loading the model.
DISABLE_UNK_TOKEN_CHECK = False
@lru_cache(maxsize=64)
def is_chinese(text):
    """Return True if ``text`` contains a character in U+4E00-U+9FFF.

    Results are memoized for up to 64 distinct inputs.
    """
    # \u4e00-\u9fff is the range this module treats as Chinese characters.
    return re.search(r"[\u4e00-\u9fff]", text) is not None
@lru_cache(maxsize=64)
def is_eng(text):
    """Return True if ``text`` contains at least one ASCII letter (a-z, A-Z).

    Results are memoized for up to 64 distinct inputs.
    """
    return re.search(r"[a-zA-Z]", text) is not None
@lru_cache(maxsize=64)
def guess_lang(text):
    """Guess the language code of ``text``: either ``"zh"`` or ``"en"``.

    Any Chinese character makes the text ``"zh"``; otherwise any ASCII letter
    makes it ``"en"``. Text containing neither falls back to ``"zh"``.
    """
    english_only = not is_chinese(text) and is_eng(text)
    return "en" if english_only else "zh"
# Functions registered through @post_normalize(); apply_post_normalize() runs
# them in registration order.
post_normalize_pipeline = []
# Functions registered through @pre_normalize(); apply_pre_normalize() runs
# them in registration order.
pre_normalize_pipeline = []
def post_normalize():
    """Return a decorator that registers a function as a post-normalize step.

    The decorated function is appended to ``post_normalize_pipeline`` and
    returned unchanged.
    """

    def register(step):
        post_normalize_pipeline.append(step)
        return step

    return register
def pre_normalize():
    """Return a decorator that registers a function as a pre-normalize step.

    The decorated function is appended to ``pre_normalize_pipeline`` and
    returned unchanged.
    """

    def register(step):
        pre_normalize_pipeline.append(step)
        return step

    return register
def apply_pre_normalize(text):
    """Run ``text`` through every registered pre-normalize step, in order.

    Each step receives the previous step's output; the final output is returned.
    """
    result = text
    for step in pre_normalize_pipeline:
        result = step(result)
    return result
def apply_post_normalize(text):
    """Run ``text`` through every registered post-normalize step, in order.

    Each step receives the previous step's output; the final output is returned.
    """
    result = text
    for step in post_normalize_pipeline:
        result = step(result)
    return result
def is_markdown(text):
    """Heuristically report whether ``text`` contains Markdown syntax.

    Returns True if any of the patterns below is found anywhere in ``text``
    (searched with ``re.MULTILINE``), otherwise False.
    """
    markdown_patterns = (
        r"(^|\s)#[^#]",  # heading
        r"\*\*.*?\*\*",  # bold
        r"\*.*?\*",  # italic
        r"!\[.*?\]\(.*?\)",  # image
        r"\[.*?\]\(.*?\)",  # link
        r"`[^`]+`",  # inline code
        r"```[\s\S]*?```",  # fenced code block
        r"(^|\s)\* ",  # unordered list
        r"(^|\s)\d+\. ",  # ordered list
        r"(^|\s)> ",  # block quote
        r"(^|\s)---",  # horizontal rule
    )
    return any(
        re.search(candidate, text, re.MULTILINE) for candidate in markdown_patterns
    )
# Single-character replacement table consumed by apply_character_map via
# str.maketrans: punctuation is mapped to a comma, a full stop, or a space.
#
# NOTE(review): as displayed here several keys (":", ";", "!", "(", ")", "-",
# "~") occur twice. In a dict literal the later entry silently wins, so e.g.
# "!" would map to "." and the earlier "。" entry would be dead. Presumably the
# first occurrences were meant to be the full-width variants of these
# characters -- TODO confirm against the file's real encoding.
character_map = {
    ":": ",",
    ";": ",",
    "!": "。",
    "(": ",",
    ")": ",",
    "【": ",",
    "】": ",",
    "『": ",",
    "』": ",",
    "「": ",",
    "」": ",",
    "《": ",",
    "》": ",",
    "-": ",",
    "‘": " ",
    "“": " ",
    "’": " ",
    "”": " ",
    '"': " ",
    "'": " ",
    ":": ",",
    ";": ",",
    "!": ".",
    "(": ",",
    ")": ",",
    "[": ",",
    "]": ",",
    ">": ",",
    "<": ",",
    "-": ",",
    "~": " ",
    "~": " ",
    "/": " ",
    "·": " ",
}
# Substrings that apply_character_to_word replaces with a spoken word.
character_to_word = {
    " & ": " and ",
}
## ---------- post normalize ----------
@post_normalize()
def apply_character_to_word(text):
    """Replace every ``character_to_word`` key found in ``text`` with its word."""
    result = text
    for symbol, word in character_to_word.items():
        result = result.replace(symbol, word)
    return result
@post_normalize()
def apply_character_map(text):
    """Translate each character of ``text`` that is a key of ``character_map``."""
    return text.translate(str.maketrans(character_map))
@post_normalize()
def apply_emoji_map(text):
    """Pass ``text`` through ``emojiswitch.demojize``.

    The language comes from ``guess_lang(text)`` and the delimiters around
    each replacement are empty strings.
    """
    return emojiswitch.demojize(
        text,
        delimiters=("", ""),
        lang=guess_lang(text),
    )
@post_normalize()
def insert_spaces_between_uppercase(s):
    """Insert a space at letter-case and script boundaries in ``s``.

    A space is inserted between two adjacent uppercase letters, between a
    lowercase letter and a following uppercase letter, and between a character
    in U+4E00-U+9FA5 and an adjacent uppercase letter (in either order).
    """
    # Every alternative is a zero-width position, so no characters are removed.
    boundary = r"(?<=[A-Z])(?=[A-Z])|(?<=[a-z])(?=[A-Z])|(?<=[\u4e00-\u9fa5])(?=[A-Z])|(?<=[A-Z])(?=[\u4e00-\u9fa5])"
    return re.sub(boundary, " ", s)
@post_normalize()
def replace_unk_tokens(text):
    """
    Replace every character that is not in the tokenizer vocabulary with " , ".

    ASCII letters and the whitespace characters space, newline, carriage
    return and tab are always kept. The text is returned unchanged when
    ``DISABLE_UNK_TOKEN_CHECK`` is set or when no tokenizer is loaded.
    """
    if DISABLE_UNK_TOKEN_CHECK:
        return text
    chat_tts = models.load_chat_tts()
    if "tokenizer" not in chat_tts.pretrain_models:
        # This only happens on Hugging Face Spaces, which loads and unloads
        # models automatically; if the tokenizer is unavailable, skip the check.
        return text
    known_chars = set(chat_tts.pretrain_models["tokenizer"].get_vocab().keys())
    # Always allow every English letter.
    known_chars.update("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
    known_chars.update(" \n\r\t")
    return "".join(ch if ch in known_chars else " , " for ch in text)
## ---------- pre normalize ----------
@pre_normalize()
def apply_markdown_to_text(text):
    """Convert ``text`` with ``markdown_to_text`` when ``is_markdown`` says it is Markdown.

    Text not detected as Markdown is returned unchanged.
    """
    return markdown_to_text(text) if is_markdown(text) else text
# Moves each quoted span onto its own line (the quote marks are kept):
#   "xxx" => \n"xxx"\n
#   'xxx' => \n'xxx'\n
@pre_normalize()
def replace_quotes(text):
    """Surround every quoted span in ``text`` with newline characters.

    Four quote styles are handled: straight double, straight single, curly
    double and curly single. The quote marks themselves stay in the output.

    A straight single quote only opens a span when it is not preceded by an
    ASCII letter or digit, and only closes one when it is not followed by an
    ASCII letter or digit. Without these guards the apostrophes in text such
    as "don't do it, it's bad" were paired up as a quotation and the sentence
    was broken apart by the inserted newlines.
    """
    quoted_spans = [
        r'("[^"]+?")',
        # ASCII-only guards (not \w) so that a quote directly adjacent to
        # Chinese text is still recognised.
        r"((?<![a-zA-Z0-9])'[^']+?'(?![a-zA-Z0-9]))",
        r"(“[^“”]+?”)",
        r"(‘[^‘’]+?’)",
    ]
    for pattern in quoted_spans:
        text = re.sub(pattern, r"\n\1\n", text)
    return text
def ensure_suffix(a: str, b: str, c: str):
    """Strip ``a`` and append ``c`` unless the stripped text already ends with ``b``.

    Args:
        a: Text to check; surrounding whitespace is removed first.
        b: Suffix to look for.
        c: Text appended when ``b`` is not already the suffix.

    Returns:
        The stripped text, with ``c`` appended when needed.
    """
    trimmed = a.strip()
    return trimmed if trimmed.endswith(b) else trimmed + c
# Spoken replacements for well-known e-mail domains. The whole domain,
# including its top-level part, is replaced by the mapped text.
email_domain_map = {
    "outlook.com": "Out look",
    "hotmail.com": "Hot mail",
    "yahoo.com": "雅虎",
}


def email_detect(text):
    """Rewrite every e-mail address found in ``text`` into a speakable form.

    The part before ``@`` is split into space-separated characters, ``@``
    becomes " at ", a domain listed in ``email_domain_map`` is replaced by its
    mapped text, and every "." left in the domain becomes " dot ".

    Examples:
        zhzluke96@outlook.com => z h z l u k e 9 6 at Out look
        a.b@example.org => a . b at example dot org
    """
    address_re = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")

    def speak(match):
        # The pattern admits exactly one "@", so partition splits cleanly.
        local_part, _, domain = match.group(1).partition("@")
        spoken_domain = email_domain_map.get(domain, domain).replace(".", " dot ")
        spelled_name = " ".join(local_part)
        return f"{spelled_name} at {spoken_domain}"

    return address_re.sub(speak, text)
def sentence_normalize(sentence_text: str):
    """Normalize one piece of text while leaving bracketed ``[...]`` segments intact.

    Bracketed segments are kept verbatim and padded with one space on each
    side. Every other run of text is passed through ``TextNormalizer`` when
    ``guess_lang`` reports "zh" (otherwise used as-is) and then through the
    post-normalize pipeline.
    """
    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    normalizer = TextNormalizer()
    # Group 1: a bracketed "[...]" segment; group 2: a run of text without "[".
    segment_re = re.compile(r"(\[.+?\])|([^[]+)")

    def normalize_part(part):
        if guess_lang(part) == "zh":
            pieces = normalizer.normalize(part)
        else:
            pieces = [part]
        return "".join(apply_post_normalize(piece) for piece in pieces)

    def replace(match):
        bracketed = match.group(1)
        if bracketed:
            return f" {bracketed} "
        return normalize_part(match.group(2))

    # NOTE: appending a trailing "[uv_break]" suffix (via ensure_suffix) for the
    # final sentence was meant to prevent dropped characters, but it introduced
    # noise, so it is deliberately not done here.
    return segment_re.sub(replace, sentence_text)
def text_normalize(text, is_end=False):
    """Normalize ``text`` line by line and return the lines joined with newlines.

    The pre-normalize pipeline runs on the whole text first. The result is
    split on newlines, each line is stripped, empty lines are dropped, and
    every remaining line goes through ``sentence_normalize``.

    ``is_end`` is accepted but not used by this function.
    """
    prepared = apply_pre_normalize(text)
    stripped_lines = (raw.strip() for raw in prepared.split("\n"))
    return "\n".join(sentence_normalize(line) for line in stripped_lines if line)
# Manual demo: runs text_normalize over a handful of sample inputs and prints
# each result.
if __name__ == "__main__":
    from modules.devices import devices

    devices.reset_device()
    # Samples: mixed Chinese/English prose, bracketed tags, a percentage, emoji,
    # Markdown, quoted dialogue, English text with emoji, and short
    # number/name lines.
    test_cases = [
        "ChatTTS是专门为对话场景设计的文本转语音模型,例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.",
        " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
        " 明天有62%的概率降雨",
        "大🍌,一条大🍌,嘿,你的感觉真的很奇妙 [lbreak]",
        """
# 你好,世界
```js
console.log('1')
```
**加粗**
*一条文本*
""",
        """
在沙漠、岩石、雪地上行走了很长的时间以后,小王子终于发现了一条大路。所有的大路都是通往人住的地方的。
“你们好。”小王子说。
这是一个玫瑰盛开的花园。
“你好。”玫瑰花说道。
小王子瞅着这些花,它们全都和他的那朵花一样。
“你们是什么花?”小王子惊奇地问。
“我们是玫瑰花。”花儿们说道。
“啊!”小王子说……。
""",
        """
State-of-the-art Machine Learning for PyTorch, TensorFlow, and JAX.
🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as:
📝 Natural Language Processing: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation.
🖼️ Computer Vision: image classification, object detection, and segmentation.
🗣️ Audio: automatic speech recognition and audio classification.
🐙 Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
""",
        """
120米
有12%的概率会下雨
埃隆·马斯克
""",
    ]
    # Each result is wrapped in a dict so that print shows its repr, which
    # makes whitespace and newlines visible.
    for i, test_case in enumerate(test_cases):
        print(f"case {i}:\n", {"x": text_normalize(test_case, is_end=True)})
|