File size: 1,726 Bytes
98f685a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import re
import jieba
from pypinyin import pinyin, Style
from data_gen.tts.data_gen_utils import PUNCS
from data_gen.tts.txt_processors.base_text_processor import BaseTxtProcessor
from utils.text_norm import NSWNormalizer


class TxtProcessor(BaseTxtProcessor):
    """Text front-end that cleans a raw string and splits it into phonemes.

    ``preprocess_text`` normalizes the input string; ``process`` converts the
    normalized string into a flat list of initials/finals using ``pypinyin``,
    with ``"|"`` used as the separator token between items.
    """

    # Translation table (code point -> code point) that folds full-width
    # punctuation and digits to their ASCII counterparts. The full-width side
    # is written with \u escapes so the literal cannot be silently collapsed
    # to ASCII (which would turn the entries into no-op identity mappings).
    table = str.maketrans(
        # full-width  : , 。 ! ? 【 】 ( )
        '\uff1a\uff0c\u3002\uff01\uff1f\u3010\u3011\uff08\uff09'
        # full-width  % # @ &
        '\uff05\uff03\uff20\uff06'
        # full-width  1 2 3 4 5 6 7 8 9 0
        '\uff11\uff12\uff13\uff14\uff15\uff16\uff17\uff18\uff19\uff10',
        ':,.!?[]()%#@&1234567890',
    )

    @staticmethod
    def preprocess_text(text):
        """Normalize ``text`` before it is converted to pinyin.

        Args:
            text: Raw input string.

        Returns:
            The cleaned string: full-width symbols folded to ASCII, passed
            through ``NSWNormalizer``, restricted to spaces/Latin letters/CJK
            ideographs (U+4E00-U+9FFF)/``PUNCS``, with all whitespace removed
            and every run of Latin letters replaced by a single ``"$"``.
        """
        text = text.translate(TxtProcessor.table)
        # NOTE(review): presumably verbalizes digits and other non-standard
        # words; the normalizer's exact behavior is defined outside this file.
        text = NSWNormalizer(text).normalize(remove_punc=False)
        # Drop quotes and parentheses entirely.
        text = re.sub("[\'\"()]+", "", text)
        # Hyphen runs become a single space.
        text = re.sub("[-]+", " ", text)
        # Whitelist: keep only spaces, Latin letters, CJK ideographs and PUNCS.
        text = re.sub(f"[^ A-Za-z\u4e00-\u9fff{PUNCS}]", "", text)
        # Collapse a run of punctuation to its last character, e.g. "!!" -> "!".
        text = re.sub(f"([{PUNCS}])+", r"\1", text)
        # Pad each punctuation mark with spaces ...
        text = re.sub(f"([{PUNCS}])", r" \1 ", text)
        # ... then strip ALL whitespace. NOTE(review): this also removes the
        # padding added on the previous line; kept as-is so output is unchanged.
        text = re.sub(r"\s+", r"", text)
        # Each run of Latin letters is replaced by a single "$" placeholder.
        text = re.sub(r"[A-Za-z]+", r"$", text)
        return text

    @classmethod
    def process(cls, txt, pre_align_args):
        """Convert ``txt`` into a phoneme sequence.

        Args:
            txt: Raw input string; it is cleaned with ``preprocess_text`` first.
            pre_align_args: Mapping that must contain the key ``'use_tone'``.
                When truthy, finals carry a tone digit (``FINALS_TONE3``), and
                finals for which pypinyin emitted no digit get ``'5'`` appended.

        Returns:
            A tuple ``(phs, txt)``: ``phs`` is a list of strings that starts
            with ``"|"`` and has a ``"|"`` after every converted item; ``txt``
            is the preprocessed string.

        Raises:
            KeyError: If ``pre_align_args`` has no ``'use_tone'`` entry.
        """
        txt = cls.preprocess_text(txt)
        # Each pinyin() call returns a list of single-candidate lists, one
        # entry per converted item, hence the [0] indexing below.
        shengmu = pinyin(txt, style=Style.INITIALS)  # https://blog.csdn.net/zhoulei124/article/details/89055403
        yunmu_finals = pinyin(txt, style=Style.FINALS)
        yunmu_tone3 = pinyin(txt, style=Style.FINALS_TONE3)
        if pre_align_args['use_tone']:
            # If the toned final equals the plain final, no tone digit was
            # appended, so mark it explicitly with '5'.
            yunmu = [[t[0] + '5'] if t[0] == f[0] else t
                     for f, t in zip(yunmu_finals, yunmu_tone3)]
        else:
            yunmu = yunmu_finals

        assert len(shengmu) == len(yunmu), \
            "initials and finals must have the same number of entries"
        phs = ["|"]
        for a, b, c in zip(shengmu, yunmu, yunmu_finals):
            if a[0] == c[0]:
                # Initial and final are identical: the item was not split into
                # two parts (presumably a pass-through token such as
                # punctuation or "$"), so emit it once.
                phs += [a[0], "|"]
            else:
                # NOTE(review): a[0] may be an empty string when the syllable
                # has no initial; it is still emitted, as in the original.
                phs += [a[0], b[0], "|"]
        return phs, txt