# Spaces: zhengr / ChatTTS-Forge (Sleeping)
# File size: 3,773 Bytes
# 01e655b

from modules.utils.zh_normalization.text_normlization import *

# Single-character punctuation folding table consumed by str.maketrans.
# Full-width marks collapse to the full-width comma / full stop, curly
# quotes become a space, and the ASCII counterparts collapse to "," / ".".
# The runs below are listed in the table's established insertion order.
# '[' and ']' are deliberately left unmapped.
character_map = {
    **dict.fromkeys("：；", "，"),
    "！": "。",
    **dict.fromkeys("（）【】『』「」《》－", "，"),
    **dict.fromkeys("‘“’”", " "),
    **dict.fromkeys(":;", ","),
    "!": ".",
    **dict.fromkeys("()><-", ","),
}

# Symbol sequences that are rewritten as words by plain substring
# replacement. The surrounding spaces are part of the key, so only an
# ampersand with a space on each side is replaced.
character_to_word = {
    " & ": " and ",
}


def apply_character_to_word(text):
    """Return ``text`` with every key of ``character_to_word`` replaced.

    Replacements are applied one after another, in the dictionary's
    iteration order, using plain substring replacement.
    """
    result = text
    for symbol, word in character_to_word.items():
        result = result.replace(symbol, word)
    return result


def apply_character_map(text):
    """Return ``text`` with each character found in ``character_map`` swapped
    for its mapped value; all other characters are left unchanged.

    The translation table is rebuilt from ``character_map`` on every call.
    """
    return text.translate(str.maketrans(character_map))


def insert_spaces_between_uppercase(s):
    """Insert a space at every boundary that touches an ASCII uppercase letter.

    A space is inserted between:
      * two adjacent uppercase letters,
      * a lowercase letter followed by an uppercase letter,
      * a character in the range U+4E00..U+9FA5 followed by an uppercase letter,
      * an uppercase letter followed by a character in that same range.
    """
    # All alternatives are zero-width look-arounds, so nothing is consumed:
    # the substitution only adds spaces between existing characters.
    boundary = re.compile(
        r"(?<=[A-Z])(?=[A-Z])|(?<=[a-z])(?=[A-Z])|(?<=[\u4e00-\u9fa5])(?=[A-Z])|(?<=[A-Z])(?=[\u4e00-\u9fa5])"
    )
    return boundary.sub(" ", s)


def ensure_suffix(a: str, b: str, c: str):
    """Strip ``a`` and append ``c`` unless the stripped text ends with ``b``.

    Args:
        a: Text to check; leading and trailing whitespace is removed first.
        b: Suffix to look for.
        c: Text appended when ``b`` is not already the suffix.

    Returns:
        The stripped text, with ``c`` appended when needed.
    """
    trimmed = a.strip()
    return trimmed if trimmed.endswith(b) else trimmed + c


# Spoken replacements for well-known mail domains. Keys are lowercase full
# domains; email_detect lowercases the domain before looking it up here.
email_domain_map = {
    "outlook.com": "Out look",
    "hotmail.com": "Hot mail",
    "yahoo.com": "雅虎",
}


def email_detect(text):
    """Rewrite every e-mail address found in ``text`` into a speakable form.

    The local part is spelled out character by character, ``@`` becomes
    ``at``, and the domain is either replaced by its entry in
    ``email_domain_map`` or, when it has no entry, has each ``.`` replaced
    by `` dot ``.

    Examples:
        zhzluke96@outlook.com => z h z l u k e 9 6 at Out look
        user@example.org      => u s e r at example dot org

    Args:
        text: Input text that may contain e-mail addresses.

    Returns:
        ``text`` with each matched address replaced; text without an
        address is returned unchanged.
    """
    email_pattern = re.compile(r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")

    def replace(match):
        # Neither character class in the pattern admits "@", so a match
        # always splits into exactly two parts.
        name, domain = match.group(1).split("@")
        spelled_name = " ".join(name)
        # Mail domains are case-insensitive: look up the lowercase form so
        # "Outlook.com" hits the same entry as "outlook.com". Unknown domains
        # keep their original casing.
        spoken_domain = email_domain_map.get(domain.lower(), domain)
        spoken_domain = spoken_domain.replace(".", " dot ")
        return f"{spelled_name} at {spoken_domain}"

    return email_pattern.sub(replace, text)


def pre_normalize(text):
    """Hook that runs before the text normalizer; currently a pass-through.

    Returns ``text`` unchanged.
    """
    # NOTE: e-mail rewriting is disabled because its results were mediocre.
    # text = email_detect(text)
    return text


def post_normalize(text):
    """Apply the post-processing steps to one normalized sentence.

    Steps run in this order: space insertion around uppercase letters,
    single-character punctuation mapping, then symbol-to-word replacement.
    """
    for step in (
        insert_spaces_between_uppercase,
        apply_character_map,
        apply_character_to_word,
    ):
        text = step(text)
    return text


def text_normalize(text, is_end=False):
    """Normalize ``text`` while keeping ``[bracketed]`` tokens intact.

    The input is split into ``[...]`` tokens and the text between them.
    Each token is emitted unchanged with one space added on either side.
    Every other span goes through ``pre_normalize``, the ``TextNormalizer``,
    and ``post_normalize`` (per returned sentence); the results are
    concatenated.

    Args:
        text: Raw input text.
        is_end: Currently unused; the suffix logic that read it is disabled
            (see the note at the end of the function).

    Returns:
        The normalized text.
    """
    # https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/paddlespeech/t2s/frontend/zh_normalization
    normalizer = TextNormalizer()

    # Group 1: a [...] token; group 2: a run of text containing no "[".
    token_or_text = re.compile(r"(\[.+?\])|([^[]+)")

    def normalize_part(part):
        sentences = normalizer.normalize(pre_normalize(part))
        return "".join(post_normalize(sentence) for sentence in sentences)

    def replace(match):
        token, plain = match.groups()
        if token:
            return f" {token} "
        return normalize_part(plain)

    # NOTE: appending the suffix below introduced audible noise, so it stays
    # disabled. It was meant to prevent the last characters being swallowed.
    # if is_end:
    # result = ensure_suffix(result, "[uv_break]", "。。。[uv_break]。。。")

    return token_or_text.sub(replace, text)


if __name__ == "__main__":
    # Manual smoke test: print the normalized form of a few sample inputs
    # (mixed Chinese/English text, bracketed tokens, a full-width percent).
    demo_inputs = (
        "ChatTTS是专门为对话场景设计的文本转语音模型，例如LLM助手对话任务。它支持英文和中文两种语言。最大的模型使用了10万小时以上的中英文数据进行训练。在HuggingFace中开源的版本为4万小时训练且未SFT的版本.",
        " [oral_9] [laugh_0] [break_0] 电 [speed_0] 影 [speed_0] 中 梁朝伟 [speed_9] 扮演的陈永仁的编号27149",
        " 明天有62％的概率降雨",
    )
    for demo in demo_inputs:
        print(text_normalize(demo))