add information for inference
#4 by yingzhi - opened
README.md CHANGED
```diff
@@ -48,7 +48,13 @@ hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", saved
 
 # Run TTS with text input
 input_text = "were the leaders in this luckless change; though our own Baskerville; who was at work some years before them; went much on the same lines;"
-mel_output, durations, pitch, energy = fastspeech2.encode_text([input_text])
+
+mel_output, durations, pitch, energy = fastspeech2.encode_text(
+    [input_text],
+    pace=1.0,        # scale up/down the speed
+    pitch_rate=1.0,  # scale up/down the pitch
+    energy_rate=1.0, # scale up/down the energy
+)
 
 # Running Vocoder (spectrogram-to-waveform)
 waveforms = hifi_gan.decode_batch(mel_output)
@@ -59,7 +65,12 @@ torchaudio.save('example_TTS_input_text.wav', waveforms.squeeze(1), 22050)
 
 # Run TTS with phoneme input
 input_phonemes = ['W', 'ER', 'DH', 'AH', 'L', 'IY', 'D', 'ER', 'Z', 'IH', 'N', 'DH', 'IH', 'S', 'L', 'AH', 'K', 'L', 'AH', 'S', 'CH', 'EY', 'N', 'JH', 'spn', 'DH', 'OW', 'AW', 'ER', 'OW', 'N', 'B', 'AE', 'S', 'K', 'ER', 'V', 'IH', 'L', 'spn', 'HH', 'UW', 'W', 'AA', 'Z', 'AE', 'T', 'W', 'ER', 'K', 'S', 'AH', 'M', 'Y', 'IH', 'R', 'Z', 'B', 'IH', 'F', 'AO', 'R', 'DH', 'EH', 'M', 'spn', 'W', 'EH', 'N', 'T', 'M', 'AH', 'CH', 'AA', 'N', 'DH', 'AH', 'S', 'EY', 'M', 'L', 'AY', 'N', 'Z', 'spn']
-mel_output, durations, pitch, energy = fastspeech2.encode_phoneme([input_phonemes])
+mel_output, durations, pitch, energy = fastspeech2.encode_phoneme(
+    [input_phonemes],
+    pace=1.0,        # scale up/down the speed
+    pitch_rate=1.0,  # scale up/down the pitch
+    energy_rate=1.0, # scale up/down the energy
+)
 
 # Running Vocoder (spectrogram-to-waveform)
 waveforms = hifi_gan.decode_batch(mel_output)
```
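
For context, here is a minimal end-to-end sketch of the text-input flow these changes document. The `encode_text` arguments and the `decode_batch`/`torchaudio.save` steps come from the diff; the import path, the FastSpeech2 `source` name, and both `savedir` values are assumptions (only the HiFi-GAN `from_hparams` call is visible in the hunk context), so check the full README for the exact loading code.

```python
import torchaudio
# Import path assumed from the SpeechBrain pretrained interface; adjust to your SpeechBrain version.
from speechbrain.pretrained import FastSpeech2, HIFIGAN

# Load the pretrained models; the FastSpeech2 source and both savedir paths are
# illustrative assumptions, not taken from this diff.
fastspeech2 = FastSpeech2.from_hparams(
    source="speechbrain/tts-fastspeech2-ljspeech",
    savedir="pretrained_models/tts-fastspeech2-ljspeech",
)
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="pretrained_models/tts-hifigan-ljspeech",
)

# Run TTS with text input, using the keyword arguments added in this PR
input_text = "were the leaders in this luckless change; though our own Baskerville; who was at work some years before them; went much on the same lines;"
mel_output, durations, pitch, energy = fastspeech2.encode_text(
    [input_text],
    pace=1.0,        # scale up/down the speed
    pitch_rate=1.0,  # scale up/down the pitch
    energy_rate=1.0, # scale up/down the energy
)

# Run the vocoder (spectrogram-to-waveform) and write the result to disk
waveforms = hifi_gan.decode_batch(mel_output)
torchaudio.save("example_TTS_input_text.wav", waveforms.squeeze(1), 22050)
```

Per the inline comments in the added lines, setting `pace`, `pitch_rate`, or `energy_rate` to values other than 1.0 scales the speed, pitch, or energy of the synthesized speech up or down.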
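
The phoneme-input hunk follows the same pattern; a short sketch reusing the models loaded above, with a shortened phoneme list and an assumed output filename:

```python
# Continues from the sketch above (torchaudio, fastspeech2, hifi_gan already defined).
# encode_phoneme takes the same pace / pitch_rate / energy_rate keywords added for encode_text.
input_phonemes = ['W', 'ER', 'DH', 'AH', 'L', 'IY', 'D', 'ER', 'Z']  # shortened example sequence
mel_output, durations, pitch, energy = fastspeech2.encode_phoneme(
    [input_phonemes],
    pace=1.0,
    pitch_rate=1.0,
    energy_rate=1.0,
)

# Vocoder + save (the filename is an assumed example, not from the diff)
waveforms = hifi_gan.decode_batch(mel_output)
torchaudio.save("example_TTS_input_phonemes.wav", waveforms.squeeze(1), 22050)
```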