## VCTK
import torch
import os
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from scipy.io.wavfile import write
import gradio as gr
print("Running GRadio", gr.__version__)
model_path = "vits2_pytorch/G_390000.pth"
config_path = "vits2_pytorch/vits2_vctk_cat_inference.json"
hps = utils.get_hparams_from_file(config_path)
# VITS2's mel posterior encoder consumes 80-bin mel spectrograms; the original
# VITS posterior works on linear spectrograms with filter_length // 2 + 1 bins.
if (
    "use_mel_posterior_encoder" in hps.model.keys()
    and hps.model.use_mel_posterior_encoder == True
):
    print("Using mel posterior encoder for VITS2")
    posterior_channels = 80  # vits2
    hps.data.use_mel_posterior_encoder = True
else:
    print("Using lin posterior encoder for VITS1")
    posterior_channels = hps.data.filter_length // 2 + 1
    hps.data.use_mel_posterior_encoder = False
net_g = SynthesizerTrn(
    len(symbols),
    posterior_channels,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model
)
# Put the model in inference mode and load the trained weights.
_ = net_g.eval()
_ = utils.load_checkpoint(model_path, net_g, None)
def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    # text_norm = cleaned_text_to_sequence(text)  # use if the model was trained on pre-cleaned text
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm
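# Usage sketch: get_text("bon dia", hps) returns a 1-D LongTensor of symbol ids,
# with a 0 ("blank") interleaved between ids when hps.data.add_blank is set.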
def tts(text: str, speaker_id: int, speed: float, noise_scale: float = 0.667, noise_scale_w: float = 0.8):
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([speaker_id])
        waveform = (
            net_g.infer(
                x_tst,
                x_tst_lengths,
                sid=sid,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=1 / speed,  # higher speed -> shorter predicted durations
            )[0][0, 0]
            .data.cpu()
            .float()
            .numpy()
        )
    # Return (sample_rate, samples) so the gr.Audio output can play the raw
    # waveform directly; 22050 Hz is the sampling rate the model was trained at.
    return 22050, waveform
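# Standalone usage sketch (hypothetical output filename): the scipy `write`
# import above can dump the synthesized waveform to disk for testing outside Gradio.
#   sr, wav = tts("bon dia a tothom", speaker_id=10, speed=1.0)
#   write("sample_ca.wav", sr, wav)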
## GUI space
title = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
> <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
VITS2 TTS Catalan Demo
</h1> </div>
</div>
"""
description = """
VITS2 is an end-to-end speech synthesis model that predicts a speech waveform conditional on an input text sequence. VITS2 improved the
training and inference efficiency and naturalness by introducing adversarial learning into the duration predictor. The transformer
block was added to the normalizing flows to capture the long-term dependency when transforming the distribution.
The synthesis quality was improved by incorporating Gaussian noise into the alignment search.
This model was trained on the openslr69 and festcat datasets.
"""
article = "Model by Jungil Kong, et al. from SK telecom. Demo by BSC."
vits2_inference = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            value="m'ha costat desenvolupar molt una veu, i ara que la tinc no estaré en silenci.",
            max_lines=1,
            label="Input text",
        ),
        gr.Slider(
            1,
            47,
            value=10,
            step=1,
            label="Speaker id",
            info="This model is trained on 47 speakers. You can prompt the model using one of these speaker ids.",
        ),
        gr.Slider(
            0.5,
            1.5,
            value=1,
            step=0.1,
            label="Speed",
        ),
        # noise_scale controls the variance of the prior noise (prosody variation);
        # noise_scale_w scales the stochastic duration predictor's noise (timing variation).
        gr.Slider(
            0.2,
            2.0,
            value=0.667,
            step=0.01,
            label="Noise scale",
        ),
        gr.Slider(
            0.2,
            2.0,
            value=0.8,
            step=0.01,
            label="Noise scale w",
        ),
    ],
    outputs=gr.Audio(),
)
demo = gr.Blocks()
with demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.TabbedInterface([vits2_inference], ["Multispeaker"])
    gr.Markdown(article)
demo.queue(max_size=10)
demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860)