Text-to-Speech
English
hexgrad committed on
Commit
d53ec79
·
verified ·
1 Parent(s): b869fc9

Upload kokoro.py

Browse files
Files changed (1) hide show
  1. kokoro.py +11 -7
kokoro.py CHANGED
@@ -86,18 +86,22 @@ VOCAB = get_vocab()
86
  def tokenize(ps):
87
  return [i for i in map(VOCAB.get, ps) if i is not None]
88
 
89
- en_us = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
90
- def phonemize(text, norm=True):
 
 
 
91
  if norm:
92
  text = normalize_text(text)
93
- ps = en_us.phonemize([text])
94
  ps = ps[0] if ps else ''
95
  # https://en.wiktionary.org/wiki/kokoro#English
96
  ps = ps.replace('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ').replace('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ')
97
  ps = ps.replace('ʲ', 'j').replace('r', 'ɹ').replace('x', 'k').replace('ɬ', 'l')
98
  ps = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', ps)
99
  ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', ps)
100
- ps = re.sub(r'(?<=nˈaɪn)ti(?!ː)', 'di', ps)
 
101
  ps = ''.join(filter(lambda p: p in VOCAB, ps))
102
  return ps.strip()
103
 
@@ -131,8 +135,8 @@ def forward(model, tokens, ref_s, speed):
131
  asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
132
  return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
133
 
134
- def generate(model, text, voicepack, speed=1, ps=None):
135
- ps = ps or phonemize(text)
136
  tokens = tokenize(ps)
137
  if not tokens:
138
  return None
@@ -142,4 +146,4 @@ def generate(model, text, voicepack, speed=1, ps=None):
142
  ref_s = voicepack[len(tokens)]
143
  out = forward(model, tokens, ref_s, speed)
144
  ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
145
- return out, ps
 
86
  def tokenize(ps):
87
  return [i for i in map(VOCAB.get, ps) if i is not None]
88
 
89
+ phonemizers = dict(
90
+ a=phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
91
+ b=phonemizer.backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
92
+ )
93
+ def phonemize(text, lang, norm=True):
94
  if norm:
95
  text = normalize_text(text)
96
+ ps = phonemizers[lang].phonemize([text])
97
  ps = ps[0] if ps else ''
98
  # https://en.wiktionary.org/wiki/kokoro#English
99
  ps = ps.replace('kəkˈoːɹoʊ', 'kˈoʊkəɹoʊ').replace('kəkˈɔːɹəʊ', 'kˈəʊkəɹəʊ')
100
  ps = ps.replace('ʲ', 'j').replace('r', 'ɹ').replace('x', 'k').replace('ɬ', 'l')
101
  ps = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', ps)
102
  ps = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', ps)
103
+ if lang == 'a':
104
+ ps = re.sub(r'(?<=nˈaɪn)ti(?!ː)', 'di', ps)
105
  ps = ''.join(filter(lambda p: p in VOCAB, ps))
106
  return ps.strip()
107
 
 
135
  asr = t_en @ pred_aln_trg.unsqueeze(0).to(device)
136
  return model.decoder(asr, F0_pred, N_pred, ref_s[:, :128]).squeeze().cpu().numpy()
137
 
138
+ def generate(model, text, voicepack, lang='a', speed=1):
139
+ ps = phonemize(text, lang)
140
  tokens = tokenize(ps)
141
  if not tokens:
142
  return None
 
146
  ref_s = voicepack[len(tokens)]
147
  out = forward(model, tokens, ref_s, speed)
148
  ps = ''.join(next(k for k, v in VOCAB.items() if i == v) for i in tokens)
149
+ return out, ps