import spaces import torch from TTS.tts.configs.xtts_config import XttsConfig from TTS.tts.models.xtts import Xtts from pathlib import Path import gradio as gr CONFIG_URL = 'https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/config.json' VOCAB_URL = 'https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/vocab.json' MODEL_URL = 'https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/model_2.1.pth' SPEAKER_AUDIO_URL = 'https://huggingface.co/medmac01/xtt2_darija_v0.1/resolve/main/speaker_reference.wav' base_path = Path(__file__).parent # Download the files into the base_path config_path = base_path / 'config.json' if not config_path.exists(): torch.hub.download_url_to_file(CONFIG_URL, config_path) vocab_path = base_path / 'vocab.json' if not vocab_path.exists(): torch.hub.download_url_to_file(VOCAB_URL, vocab_path) model_path = base_path / 'model.pth' if not model_path.exists(): torch.hub.download_url_to_file(MODEL_URL, model_path) speaker_audio_path = base_path / 'speaker_reference.wav' if not speaker_audio_path.exists(): torch.hub.download_url_to_file(SPEAKER_AUDIO_URL, speaker_audio_path) config_path = str(config_path) vocab_path = str(vocab_path) model_path = str(model_path) speaker_audio_path = str(speaker_audio_path) config = XttsConfig() config.load_json(config_path) print("Loading model...") device = "cuda" if torch.cuda.is_available() else "cpu" print(device) model = Xtts.init_from_config(config) model.load_checkpoint(config, checkpoint_path=model_path, use_deepspeed=False, vocab_path=vocab_path, eval=True) model.to(device) @spaces.GPU def infer_EGTTS(text: str, speaker_audio_path: str, temperature: float = 0.75): print("Computing speaker latents...") gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[speaker_audio_path]) print("Inference...") out = model.inference( text, "ar", gpt_cond_latent, speaker_embedding, temperature=temperature, ) return 24000, out["wav"] markdown_description = """## Instructions: 1. Enter the text you want to synthesize. 2. Upload a 4-5 seconds audio file of the speaker you want to clone. 3. Click on the "Generate" button. """ with gr.Blocks(title="EGTTS") as app: gr.HTML("