Text-to-Audio / text /g2p_module.py
yuancwang
init
5548515
raw
history blame
7.72 kB
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import re
from g2p_en import G2p
from string import punctuation
from typing import Any, Dict, List, Optional, Pattern, Union
from phonemizer.backend import EspeakBackend
from phonemizer.backend.espeak.language_switch import LanguageSwitch
from phonemizer.backend.espeak.words_mismatch import WordMismatch
from phonemizer.punctuation import Punctuation
from phonemizer.separator import Separator
try:
from pypinyin import Style, pinyin
from pypinyin.style._utils import get_finals, get_initials
except Exception:
pass
# This code is modified from
# https://github.com/lifeiteng/vall-e/blob/9c69096d603ce13174fb5cb025f185e2e9b36ac7/valle/data/tokenizer.py
class PypinyinBackend:
"""PypinyinBackend for Chinese. Most codes is referenced from espnet.
There are two types pinyin or initials_finals, one is
just like "ni1 hao3", the other is like "n i1 h ao3".
"""
def __init__(
self,
backend="initials_finals",
punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
) -> None:
self.backend = backend
self.punctuation_marks = punctuation_marks
def phonemize(
self, text: List[str], separator: Separator, strip=True, njobs=1
) -> List[str]:
assert isinstance(text, List)
phonemized = []
for _text in text:
_text = re.sub(" +", " ", _text.strip())
_text = _text.replace(" ", separator.word)
phones = []
if self.backend == "pypinyin":
for n, py in enumerate(
pinyin(_text, style=Style.TONE3, neutral_tone_with_five=True)
):
if all([c in self.punctuation_marks for c in py[0]]):
if len(phones):
assert phones[-1] == separator.syllable
phones.pop(-1)
phones.extend(list(py[0]))
else:
phones.extend([py[0], separator.syllable])
elif self.backend == "pypinyin_initials_finals":
for n, py in enumerate(
pinyin(_text, style=Style.TONE3, neutral_tone_with_five=True)
):
if all([c in self.punctuation_marks for c in py[0]]):
if len(phones):
assert phones[-1] == separator.syllable
phones.pop(-1)
phones.extend(list(py[0]))
else:
if py[0][-1].isalnum():
initial = get_initials(py[0], strict=False)
if py[0][-1].isdigit():
final = get_finals(py[0][:-1], strict=False) + py[0][-1]
else:
final = get_finals(py[0], strict=False)
phones.extend(
[
initial,
separator.phone,
final,
separator.syllable,
]
)
else:
assert ValueError
else:
raise NotImplementedError
phonemized.append(
"".join(phones).rstrip(f"{separator.word}{separator.syllable}")
)
return phonemized
class G2PModule:
"""Phonemize Text."""
def __init__(
self,
language="en-us",
backend="espeak",
separator=Separator(word="_", syllable="-", phone="|"),
preserve_punctuation=True,
punctuation_marks: Union[str, Pattern] = Punctuation.default_marks(),
with_stress: bool = False,
tie: Union[bool, str] = False,
language_switch: LanguageSwitch = "keep-flags",
words_mismatch: WordMismatch = "ignore",
) -> None:
self.backend = self._initialize_backend(
backend,
language,
punctuation_marks,
preserve_punctuation,
with_stress,
tie,
language_switch,
words_mismatch,
)
self.separator = separator
def _initialize_backend(
self,
backend,
language,
punctuation_marks,
preserve_punctuation,
with_stress,
tie,
language_switch,
words_mismatch,
):
if backend == "espeak":
return EspeakBackend(
language,
punctuation_marks=punctuation_marks,
preserve_punctuation=preserve_punctuation,
with_stress=with_stress,
tie=tie,
language_switch=language_switch,
words_mismatch=words_mismatch,
)
elif backend in ["pypinyin", "pypinyin_initials_finals"]:
return PypinyinBackend(
backend=backend,
punctuation_marks=punctuation_marks + self.separator.word,
)
else:
raise NotImplementedError(f"{backend}")
def to_list(self, phonemized: str) -> List[str]:
fields = []
for word in phonemized.split(self.separator.word):
pp = re.findall(r"\w+|[^\w\s]", word, re.UNICODE)
fields.extend(
[p for p in pp if p != self.separator.phone] + [self.separator.word]
)
assert len("".join(fields[:-1])) == len(phonemized) - phonemized.count(
self.separator.phone
)
return fields[:-1]
def phonemization(self, text, strip=True) -> List[List[str]]:
if isinstance(text, str):
text = [text]
phonemized = self.backend.phonemize(
text, separator=self.separator, strip=strip, njobs=1
)
phonemes = [self.to_list(p) for p in phonemized]
return phonemes
def g2p_conversion(self, text: str) -> List[str]:
phonemes = self.phonemization([text.strip()])
return phonemes[0]
class LexiconModule:
def __init__(self, lex_path, language="en-us") -> None:
# todo: check lexicon derivation, merge with G2PModule?
lexicon = {}
with open(lex_path) as f:
for line in f:
temp = re.split(r"\s+", line.strip("\n"))
word = temp[0]
phones = temp[1:]
if word.lower() not in lexicon:
lexicon[word.lower()] = phones
self.lexicon = lexicon
self.language = language
self.lang2g2p = {"en-us": G2p()}
def g2p_conversion(self, text):
phone = None
# todo: preprocess with other languages
if self.language == "en-us":
phone = self.preprocess_english(text)
else:
print("No support to", self.language)
raise
return phone
def preprocess_english(self, text):
text = text.rstrip(punctuation)
g2p = self.lang2g2p["en-us"]
phones = []
words = re.split(r"([,;.\-\?\!\s+])", text)
for w in words:
if w.lower() in self.lexicon:
phones += self.lexicon[w.lower()]
else:
phones += list(filter(lambda p: p != " ", g2p(w)))
phones = "}{".join(phones)
phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
phones = phones.replace("}{", " ")
return phones