# Page banner captured along with this source (not part of the program): "Spaces: Runtime error".
import spaces | |
import gradio as gr | |
import torch | |
from TTS.api import TTS | |
import os | |
import argparse | |
import os | |
import sys | |
import tempfile | |
import librosa.display | |
import numpy as np | |
import torchaudio | |
import traceback | |
from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list | |
from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt | |
from TTS.tts.configs.xtts_config import XttsConfig | |
from TTS.tts.models.xtts import Xtts | |
# Set before the model is requested below; presumably this accepts the Coqui
# terms of service non-interactively -- confirm against the TTS package.
os.environ["COQUI_TOS_AGREED"] = "1"

# Fall back to CPU when no CUDA device is present, in line with the
# torch.cuda.is_available() guards used by the functions in this file.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): `tts` is not referenced again in the visible code; it looks
# like this call exists to fetch the model files used below -- confirm.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_bill_spa").to(device)

# Directory holding the checkpoint, config and vocabulary passed to load_model().
_MODEL_DIR = '/home/user/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_bill_spa'
model_path = f'{_MODEL_DIR}/model.pth'
config_path = f'{_MODEL_DIR}/config.json'
vocab_path = f'{_MODEL_DIR}/vocab.json'
def clear_gpu_cache():
    """Empty PyTorch's CUDA cache; do nothing when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
# The XTTS model used by run_tts(); stays None until load_model() has
# finished loading a checkpoint successfully.
XTTS_MODEL = None


def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
    """Load an XTTS checkpoint and publish it as the module-level ``XTTS_MODEL``.

    Args:
        xtts_checkpoint: Path of the model checkpoint file.
        xtts_config: Path of the JSON config read into an ``XttsConfig``.
        xtts_vocab: Path of the vocabulary file.

    Returns:
        An error message string when any of the three paths is missing or
        empty; ``None`` once the model has been loaded.
    """
    global XTTS_MODEL

    # Validate first so a call with missing paths touches nothing.
    if not xtts_checkpoint or not xtts_config or not xtts_vocab:
        return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"

    # Drop the reference to any previously loaded model before flushing the
    # cache, so the flush is not done while the old model is still held.
    XTTS_MODEL = None
    clear_gpu_cache()

    config = XttsConfig()
    config.load_json(xtts_config)
    model = Xtts.init_from_config(config)
    print("Loading XTTS model! ")
    model.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
    if torch.cuda.is_available():
        model.cuda()

    # Publish only after the checkpoint is fully loaded: if anything above
    # raises, XTTS_MODEL stays None instead of pointing at a model without
    # weights that run_tts() would otherwise accept.
    XTTS_MODEL = model
    print("Model Loaded!")
def run_tts(lang, tts_text, speaker_audio_file):
    """Synthesize ``tts_text`` with the loaded XTTS model and save it as a WAV file.

    Args:
        lang: Language code forwarded to ``XTTS_MODEL.inference`` as ``language``.
        tts_text: Text to synthesize.
        speaker_audio_file: Path of the reference audio used to compute the
            conditioning latents and speaker embedding.

    Returns:
        A ``(out_path, speaker_audio_file)`` tuple, where ``out_path`` is the
        path of the generated ``.wav`` file. When no model is loaded or no
        reference audio is given, the message is printed and ``(None, None)``
        is returned, so both paths yield a 2-tuple that callers can unpack.
    """
    # The reference-audio check comes first so it short-circuits before the
    # model global is consulted.
    if not speaker_audio_file or XTTS_MODEL is None:
        print("You need to run the previous step to load the model !!")
        return None, None

    config = XTTS_MODEL.config
    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
        audio_path=speaker_audio_file,
        gpt_cond_len=config.gpt_cond_len,
        max_ref_length=config.max_ref_len,
        sound_norm_refs=config.sound_norm_refs,
    )
    out = XTTS_MODEL.inference(
        text=tts_text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=config.temperature,  # Add custom parameters here
        length_penalty=config.length_penalty,
        repetition_penalty=config.repetition_penalty,
        top_k=config.top_k,
        top_p=config.top_p,
    )
    # torchaudio.save is given a 2-D tensor, hence the added leading dimension.
    waveform = torch.tensor(out["wav"]).unsqueeze(0)

    # Only reserve a unique file name here; the handle is closed before
    # torchaudio writes to the path. delete=False keeps the file for the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        out_path = fp.name
    # NOTE(review): 24000 is hard-coded; presumably the model's output sample
    # rate -- confirm against the XTTS config.
    torchaudio.save(out_path, waveform, 24000)
    print("Speech generated !")
    return out_path, speaker_audio_file
def generate(text, audio):
    """Gradio handler: synthesize ``text`` in Spanish using ``audio`` as the reference voice.

    Args:
        text: Sentence to synthesize.
        audio: File path of the reference voice recording (may be empty/None
            when the user supplied none).

    Returns:
        Path of the generated audio file, as returned by ``run_tts``.

    Raises:
        gr.Error: When no reference audio was supplied, or when
            ``load_model`` reports a problem with the model paths.
    """
    # Reject a missing reference before doing any model work; run_tts cannot
    # synthesize without it.
    if not audio:
        raise gr.Error("Sube un audio de referencia para generar la voz.")

    # Load the checkpoint only when no model is in memory, instead of
    # re-reading it from disk on every request.
    if XTTS_MODEL is None:
        error = load_model(model_path, config_path, vocab_path)
        if error:
            # load_model returns a message (rather than raising) for missing paths.
            raise gr.Error(error)

    out_path, _ = run_tts(lang='es', tts_text=text, speaker_audio_file=audio)
    return out_path
# Gradio UI: a text box and a reference-voice recording go in, and the audio
# file path returned by generate() comes out.
text_input = gr.Textbox(label='Frase a generar')
reference_voice_input = gr.Audio(type='filepath', label='Voz de referencia')
generated_audio_output = gr.Audio(type='filepath')

demo = gr.Interface(
    fn=generate,
    inputs=[text_input, reference_voice_input],
    outputs=generated_audio_output,
)

# Start serving the interface.
demo.launch()