import time

import gradio as gr
import scipy.io.wavfile
import torch
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

lang = "English"
tag = "kan-bayashi/ljspeech_vits"
vocoder_tag = "none"

# Load the pretrained VITS model (trained on LJSpeech) from the ESPnet model zoo.
text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none(tag),
    vocoder_tag=str_or_none(vocoder_tag),
    device="cpu",
    # Only for Tacotron 2 & Transformer
    threshold=0.5,
    # Only for Tacotron 2
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2 & VITS
    speed_control_alpha=1.0,
    # Only for VITS
    noise_scale=0.333,
    noise_scale_dur=0.333,
)


def inference(text):
    with torch.no_grad():
        start = time.time()
        wav = text2speech(text)["wav"]
    # Report the real-time factor (synthesis time / audio duration).
    rtf = (time.time() - start) / (len(wav) / text2speech.fs)
    print(f"RTF = {rtf:.5f}")
    scipy.io.wavfile.write("out.wav", text2speech.fs, wav.view(-1).cpu().numpy())
    return "out.wav"


title = "ESPnet2 TTS"
description = "Gradio demo for ESPnet2 TTS (VITS trained on LJSpeech). To use it, simply enter your text and submit. Read more at the link below."
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>ESPnet: end-to-end speech processing toolkit | Github Repo</a></p>"
" gr.Interface( inference, "text", gr.outputs.Audio(type="file", label="Output"), title=title, description=description, article=article, enable_queue=True ).launch(debug=True)