import re

from g2p_en import G2p

from style_bert_vits2.constants import Languages
from style_bert_vits2.nlp import bert_models
from style_bert_vits2.nlp.english.cmudict import get_dict
from style_bert_vits2.nlp.symbols import PUNCTUATIONS, SYMBOLS


# Initialize global variables once

# ARPAbet phonemes (with trailing stress digits) that CMUdict / g2p_en can
# emit; anything outside this set is passed through as punctuation or "UNK".
ARPA = {
    "AH0", "S", "AH1", "EY2", "AE2", "EH0", "OW2", "UH0", "NG", "B",
    "G", "AY0", "M", "AA0", "F", "AO0", "ER2", "UH1", "IY1", "AH2",
    "DH", "IY0", "EY1", "IH0", "K", "N", "W", "IY2", "T", "AA1",
    "ER1", "EH2", "OY0", "UH2", "UW1", "Z", "AW2", "AW1", "V", "UW2",
    "AA2", "ER", "AW0", "UW0", "R", "OW1", "EH1", "ZH", "AE0", "IH2",
    "IH", "Y", "JH", "P", "AY1", "EY0", "OY2", "TH", "HH", "D",
    "ER0", "CH", "AO1", "AE1", "AO2", "OY1", "AY2", "IH1", "OW0", "L",
    "SH",
}
_g2p = G2p()
eng_dict = get_dict()


def g2p(text: str) -> tuple[list[str], list[int], list[int]]:
    """
    Convert English text into a phoneme sequence, a parallel tone (stress)
    sequence, and a word-to-phoneme-count mapping, padded with a silence
    token "_" on both ends.
    """
    phones = []
    tones = []
    phone_len = []
    words = __text_to_words(text)

    for word in words:
        temp_phones, temp_tones = [], []
        # Re-join contractions such as ["she", "'", "s"] into a single word
        if len(word) > 1 and "'" in word:
            word = ["".join(word)]
        for w in word:
            if w in PUNCTUATIONS:
                temp_phones.append(w)
                temp_tones.append(0)
                continue
            if w.upper() in eng_dict:
                # Known word: look up its syllables in CMUdict
                phns, tns = __refine_syllables(eng_dict[w.upper()])
                temp_phones += [__post_replace_ph(i) for i in phns]
                temp_tones += tns
            else:
                # Out-of-vocabulary word: fall back to g2p_en prediction
                phone_list = list(filter(lambda p: p != " ", _g2p(w)))
                phns, tns = [], []
                for ph in phone_list:
                    if ph in ARPA:
                        ph, tn = __refine_ph(ph)
                        phns.append(ph)
                        tns.append(tn)
                    else:
                        phns.append(ph)
                        tns.append(0)
                temp_phones += [__post_replace_ph(i) for i in phns]
                temp_tones += tns
        phones += temp_phones
        tones += temp_tones
        phone_len.append(len(temp_phones))

    # Spread each word's phoneme count over its subword tokens
    word2ph = []
    for token, pl in zip(words, phone_len):
        word_len = len(token)
        word2ph += __distribute_phone(pl, word_len)

    # Pad with silence on both sides
    phones = ["_"] + phones + ["_"]
    tones = [0] + tones + [0]
    word2ph = [1] + word2ph + [1]

    assert len(phones) == len(tones), text
    assert len(phones) == sum(word2ph), text

    return phones, tones, word2ph


def __post_replace_ph(ph: str) -> str:
    """Normalize a phoneme/punctuation symbol; map anything unknown to "UNK"."""
    REPLACE_MAP = {
        "：": ",",
        "；": ",",
        "，": ",",
        "。": ".",
        "！": "!",
        "？": "?",
        "\n": ".",
        "·": ",",
        "、": ",",
        "…": "...",
        "···": "...",
        "・・・": "...",
        "v": "V",
    }
    if ph in REPLACE_MAP:
        ph = REPLACE_MAP[ph]
    if ph in SYMBOLS:
        return ph
    return "UNK"


def __refine_ph(phn: str) -> tuple[str, int]:
    """
    Split an ARPAbet phoneme into (lowercase phoneme, tone): a trailing
    stress digit 0/1/2 becomes tone 1/2/3; a phoneme without a stress digit
    gets tone 3.
    """
    tone = 0
    if re.search(r"\d$", phn):
        tone = int(phn[-1]) + 1
        phn = phn[:-1]
    else:
        tone = 3
    return phn.lower(), tone


def __refine_syllables(syllables: list[list[str]]) -> tuple[list[str], list[int]]:
    """Flatten CMUdict syllable groups into parallel phoneme and tone lists."""
    tones = []
    phonemes = []
    for phn_list in syllables:
        for phn in phn_list:
            phn, tone = __refine_ph(phn)
            phonemes.append(phn)
            tones.append(tone)
    return phonemes, tones


def __distribute_phone(n_phone: int, n_word: int) -> list[int]:
    """
    Distribute n_phone phonemes over n_word tokens as evenly as possible,
    assigning each phoneme to the token with the fewest phonemes so far.
    """
    phones_per_word = [0] * n_word
    for task in range(n_phone):
        min_tasks = min(phones_per_word)
        min_index = phones_per_word.index(min_tasks)
        phones_per_word[min_index] += 1
    return phones_per_word


def __text_to_words(text: str) -> list[list[str]]:
    """
    Tokenize text with the English BERT tokenizer and group subword tokens
    into words: a "▁"-prefixed token starts a new word; punctuation is merged
    into the current word when the next token is a subword continuation, and
    otherwise kept as its own word.
    """
    tokenizer = bert_models.load_tokenizer(Languages.EN)
    tokens = tokenizer.tokenize(text)
    words = []
    for idx, t in enumerate(tokens):
        if t.startswith("▁"):
            words.append([t[1:]])
        elif t in PUNCTUATIONS:
            if idx == len(tokens) - 1:
                words.append([f"{t}"])
            elif (
                not tokens[idx + 1].startswith("▁")
                and tokens[idx + 1] not in PUNCTUATIONS
            ):
                if idx == 0:
                    words.append([])
                words[-1].append(f"{t}")
            else:
                words.append([f"{t}"])
        else:
            if idx == 0:
                words.append([])
            words[-1].append(f"{t}")
    return words


if __name__ == "__main__":
    # print(get_dict())
    # print(eng_word_to_phoneme("hello"))
    print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
    # all_phones = set()
    # eng_dict = get_dict()
    # for k, syllables in eng_dict.items():
    #     for group in syllables:
    #         for ph in group:
    #             all_phones.add(ph)
    # print(all_phones)
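
# Usage sketch (the input string is illustrative, not from the original file):
# g2p() returns a phoneme list, a parallel tone list, and per-token phoneme
# counts that include the "_" padding entries, so the asserts above guarantee:
#
#   phones, tones, word2ph = g2p("Hello world.")
#   assert len(phones) == len(tones) == sum(word2ph)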