|
import importlib
import importlib.util
from typing import List

import gruut
from gruut_ipa import IPA

from .base import BasePhonemizer
from .punctuation import Punctuation
|
|
|
|
|
# Translation table applied to every phoneme string produced by Gruut:
# maps the ASCII letter "g" (U+0067) to the IPA script "ɡ" (U+0261).
GRUUT_TRANS_TABLE = str.maketrans("g", "ɡ")
|
|
|
|
|
class Gruut(BasePhonemizer):
    """Gruut wrapper for G2P

    Args:
        language (str):
            Valid language code for the used backend.

        punctuations (str):
            Characters to be treated as punctuation. Defaults to `Punctuation.default_puncs()`.

        keep_puncs (bool):
            If true, keep the punctuations after phonemization. Defaults to True.

        use_espeak_phonemes (bool):
            If true, use espeak lexicons instead of default Gruut lexicons. Defaults to False.

        keep_stress (bool):
            If true, keep the stress characters after phonemization. Defaults to False.

    Example:

        >>> from TTS.tts.utils.text.phonemizers.gruut_wrapper import Gruut
        >>> phonemizer = Gruut('en-us')
        >>> phonemizer.phonemize("Be a voice, not an! echo?", separator="|")
        'b|i| ə| v|ɔ|ɪ|s, n|ɑ|t| ə|n! ɛ|k|o|ʊ?'
    """

    def __init__(
        self,
        language: str,
        punctuations=Punctuation.default_puncs(),
        keep_puncs=True,
        use_espeak_phonemes=False,
        keep_stress=False,
    ):
        super().__init__(language, punctuations=punctuations, keep_puncs=keep_puncs)
        self.use_espeak_phonemes = use_espeak_phonemes
        self.keep_stress = keep_stress

    @staticmethod
    def name():
        """Return the identifier of this phonemizer backend."""
        return "gruut"

    def phonemize_gruut(self, text: str, separator: str = "|", tie=False) -> str:
        """Convert input text to phonemes.

        Gruut phonemizes the given `str` by separating each phoneme character with `separator`, even for characters
        that constitute a single sound.

        It doesn't affect 🐸TTS since it individually converts each character to token IDs.

        Examples::
            "hello how are you today?" -> `h|ɛ|l|o|ʊ| h|a|ʊ| ɑ|ɹ| j|u| t|ə|d|e|ɪ`

        Args:
            text (str):
                Text to be converted to phonemes.

            separator (str, optional):
                String inserted between the characters of a word. Words are
                joined with `separator` followed by a single space.
                Defaults to "|".

            tie (bool, optional):
                Currently ignored by this implementation; the parameter is
                kept so existing callers keep working. Defaults to False.

        Returns:
            str: The phonemized text.
        """
        ph_list = []
        for sentence in gruut.sentences(text, lang=self.language, espeak=self.use_espeak_phonemes):
            for word in sentence:
                if word.is_break:
                    # Break token (presumably punctuation such as a comma):
                    # keep its literal text instead of phonemes.
                    if ph_list:
                        # Attach it to the previous word.
                        ph_list[-1].append(word.text)
                    else:
                        # Nothing precedes it, so it starts the output.
                        ph_list.append([word.text])
                elif word.phonemes:
                    word_phonemes = []

                    for word_phoneme in word.phonemes:
                        if not self.keep_stress:
                            # Drop stress markers from the phoneme.
                            word_phoneme = IPA.without_stress(word_phoneme)

                        # Normalise ASCII "g" to the IPA script "ɡ".
                        word_phoneme = word_phoneme.translate(GRUUT_TRANS_TABLE)

                        if word_phoneme:
                            # Extending a list with a string flattens the
                            # phoneme into its individual characters.
                            word_phonemes.extend(word_phoneme)

                    if word_phonemes:
                        ph_list.append(word_phonemes)

        ph_words = [separator.join(word_phonemes) for word_phonemes in ph_list]
        ph = f"{separator} ".join(ph_words)
        return ph

    def _phonemize(self, text, separator):
        """Phonemize `text` with Gruut using `separator` (no tie characters)."""
        return self.phonemize_gruut(text, separator, tie=False)

    def is_supported_language(self, language):
        """Returns True if `language` is supported by the backend"""
        return gruut.is_language_supported(language)

    @staticmethod
    def supported_languages() -> List:
        """Get the list of languages supported by Gruut.

        Returns:
            List: List of language codes.
        """
        return list(gruut.get_supported_languages())

    def version(self):
        """Get the version of the used backend.

        Returns:
            str: Version of the used backend.
        """
        return gruut.__version__

    @classmethod
    def is_available(cls):
        """Return True if the `gruut` package can be located, else False."""
        # `importlib.util` is a submodule and must be imported explicitly;
        # `import importlib` alone does not guarantee the attribute exists.
        return importlib.util.find_spec("gruut") is not None
|
|
|
|
|
if __name__ == "__main__":
    # Ad-hoc utility: phonemize the last `|`-separated field of every line in
    # a French metadata file, report which produced characters are missing
    # from the known symbol inventory, and dump the results to disk.
    import json

    from cleaner import french_cleaners

    e = Gruut(language="fr-fr", keep_puncs=True, keep_stress=True, use_espeak_phonemes=True)

    # Known symbol inventory; order is preserved in the output files.
    symbols = [
        "_", ",", ".", "!", "?", "-", "~", "\u2026",
        "N", "Q",
        "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
        "o", "p", "s", "t", "u", "v", "w", "x", "y", "z",
        "\u0251", "\u00e6", "\u0283", "\u0291", "\u00e7", "\u026f",
        "\u026a", "\u0254", "\u025b", "\u0279", "\u00f0", "\u0259",
        "\u026b", "\u0265", "\u0278", "\u028a", "\u027e", "\u0292",
        "\u03b8", "\u03b2", "\u014b", "\u0266", "\u207c", "\u02b0",
        "`", "^", "#", "*", "=",
        "\u02c8", "\u02cc", "\u2192", "\u2193", "\u2191",
        " ",
        "ɣ", "ɡ", "r", "ɲ", "ʝ", "ʎ", "ː",
    ]

    # Explicit UTF-8: the metadata holds accented French text, and relying on
    # the locale's default encoding is platform dependent.
    with open(
        "/home/xumin/workspace/VITS-Training-Multiling/230715_fr/metadata.txt", "r", encoding="utf-8"
    ) as f:
        lines = f.readlines()

    # Set built once for O(1) membership tests inside the character loop.
    known_symbols = set(symbols)

    used_sym = []  # known symbols actually produced, in first-seen order
    not_existed_sym = []  # produced characters absent from `symbols`
    phonemes = []  # one phonemized string per input line

    for line in lines:
        text = line.split("|")[-1].strip()
        text = french_cleaners(text)
        ipa = e.phonemize(text, separator="")
        phonemes.append(ipa)
        for s in ipa:
            if s not in known_symbols:
                if s not in not_existed_sym:
                    print(f"not_existed char: {s}")
                    not_existed_sym.append(s)
            elif s not in used_sym:
                used_sym.append(s)

    print(used_sym)
    print(not_existed_sym)

    # Written as one contiguous run of symbols (no separators), as before.
    with open("./text/fr_phonemizer/french_symbols.txt", "w", encoding="utf-8") as g:
        g.writelines(symbols + not_existed_sym)

    # `writelines` adds no line endings, so terminate each utterance
    # explicitly; otherwise all utterances run together on a single line.
    with open("./text/fr_phonemizer/example_ipa.txt", "w", encoding="utf-8") as g:
        g.writelines(f"{ipa}\n" for ipa in phonemes)

    data = {"symbols": symbols + not_existed_sym}

    with open("./text/fr_phonemizer/fr_symbols.json", "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
|
|
|
|