"""Gradio demo: Spanish text-to-speech with a fine-tuned XTTS voice-cloning model.

The app takes a sentence and a reference voice clip, loads the XTTS checkpoint
from the local Coqui TTS cache and returns the path of the synthesized WAV file.
"""

# NOTE(review): the import order of the original file is preserved on purpose.
# `spaces` comes before `torch`; presumably the ZeroGPU runtime relies on that
# -- confirm before regrouping these imports PEP 8 style.
import spaces
import gradio as gr
import torch
from TTS.api import TTS
import os
import argparse
import sys
import tempfile
import librosa.display
import numpy as np
import torchaudio
import traceback

from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

os.environ["COQUI_TOS_AGREED"] = "1"

device = "cuda"

# `tts` itself is never used below. NOTE(review): instantiating it presumably
# downloads the model files into the cache directory referenced by the paths
# that follow -- verify before removing this line.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_bill_spa").to(device)

# Single source of truth for the cache directory (same resulting paths as the
# three hard-coded literals it replaces).
_MODEL_DIR = (
    "/home/user/.local/share/tts/"
    "tts_models--multilingual--multi-dataset--xtts_bill_spa"
)
model_path = f"{_MODEL_DIR}/model.pth"
config_path = f"{_MODEL_DIR}/config.json"
vocab_path = f"{_MODEL_DIR}/vocab.json"

# Sample rate handed to `torchaudio.save` for the generated waveform.
_OUTPUT_SAMPLE_RATE = 24000

# Lazily populated by `load_model`; `run_tts` refuses to run while it is None.
XTTS_MODEL = None


def clear_gpu_cache():
    """Release cached GPU memory held by PyTorch, if CUDA is available."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
    """Load an XTTS checkpoint into the module-level ``XTTS_MODEL``.

    Args:
        xtts_checkpoint: Path to the model checkpoint file.
        xtts_config: Path to the XTTS JSON configuration file.
        xtts_vocab: Path to the tokenizer vocabulary file.

    Returns:
        An error message string when any of the three paths is empty,
        otherwise ``None`` (the model is stored in ``XTTS_MODEL``).
    """
    global XTTS_MODEL
    clear_gpu_cache()
    if not xtts_checkpoint or not xtts_config or not xtts_vocab:
        return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"

    config = XttsConfig()
    config.load_json(xtts_config)
    XTTS_MODEL = Xtts.init_from_config(config)
    print("Loading XTTS model! ")
    XTTS_MODEL.load_checkpoint(
        config,
        checkpoint_path=xtts_checkpoint,
        vocab_path=xtts_vocab,
        use_deepspeed=False,
    )
    if torch.cuda.is_available():
        XTTS_MODEL.cuda()
    print("Model Loaded!")


def run_tts(lang, tts_text, speaker_audio_file):
    """Synthesize ``tts_text`` in the voice of ``speaker_audio_file``.

    Args:
        lang: Language code forwarded to ``XTTS_MODEL.inference``.
        tts_text: Text to synthesize.
        speaker_audio_file: Path of the reference voice recording.

    Returns:
        A ``(out_path, speaker_audio_file)`` tuple, where ``out_path`` is the
        path of a newly created temporary ``.wav`` file. The file is created
        with ``delete=False``, so removing it is left to the caller.

    Raises:
        gr.Error: If the model has not been loaded or no reference audio was
            given. The original returned a 3-tuple in this case, which did
            not match the 2-tuple of the success path and made the caller's
            unpacking fail with ``ValueError``.
    """
    if XTTS_MODEL is None:
        raise gr.Error("You need to run the previous step to load the model !!")
    if not speaker_audio_file:
        raise gr.Error("You need to provide a reference audio file !!")

    model_config = XTTS_MODEL.config
    gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
        audio_path=speaker_audio_file,
        gpt_cond_len=model_config.gpt_cond_len,
        max_ref_length=model_config.max_ref_len,
        sound_norm_refs=model_config.sound_norm_refs,
    )
    out = XTTS_MODEL.inference(
        text=tts_text,
        language=lang,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=model_config.temperature,  # Add custom parameters here
        length_penalty=model_config.length_penalty,
        repetition_penalty=model_config.repetition_penalty,
        top_k=model_config.top_k,
        top_p=model_config.top_p,
    )

    # Add a leading dimension before saving (torchaudio.save takes a 2-D tensor).
    wav = torch.tensor(out["wav"]).unsqueeze(0)

    # Only reserve a unique file name here and close the handle before
    # torchaudio writes to that path, so the file is never open twice.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        out_path = fp.name
    torchaudio.save(out_path, wav, _OUTPUT_SAMPLE_RATE)

    print("Speech generated !")
    return out_path, speaker_audio_file


@spaces.GPU(enable_queue=True)
def generate(text, audio):
    """Gradio handler: synthesize ``text`` in Spanish using ``audio`` as voice.

    Args:
        text: Sentence to synthesize.
        audio: File path of the reference voice clip (from ``gr.Audio``).

    Returns:
        Path of the generated ``.wav`` file.

    Raises:
        gr.Error: If the model cannot be loaded or the inputs are unusable, so
            the message is shown in the UI instead of being silently dropped.
    """
    # NOTE(review): the model is reloaded on every request, as in the original.
    # Whether state survives between calls under `spaces.GPU` is not visible
    # from this file -- confirm before caching `XTTS_MODEL` across requests.
    load_error = load_model(model_path, config_path, vocab_path)
    if load_error:
        raise gr.Error(load_error)

    out_path, _ = run_tts(lang='es', tts_text=text, speaker_audio_file=audio)
    return out_path


demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label='Frase a generar'),
        gr.Audio(type='filepath', label='Voz de referencia'),
    ],
    outputs=gr.Audio(type='filepath'),
)

demo.launch()