# Moroccan-Darija-TTS

import spaces
import torch
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from pathlib import Path
import gradio as gr

# Remote locations of the files this app needs at start-up.
CONFIG_URL = "https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/config.json"
VOCAB_URL = "https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/vocab.json"
MODEL_URL = "https://huggingface.co/medmac01/darija_xtt_2.0/resolve/main/model_2.1.pth"
SPEAKER_AUDIO_URL = "https://huggingface.co/medmac01/xtt2_darija_v0.1/resolve/main/speaker_reference.wav"

# All assets are stored next to this script.
base_path = Path(__file__).parent


def _fetch_if_missing(url, filename):
    """Download `url` to `base_path / filename` unless that file already exists.

    Returns the local file location as a plain string.
    """
    target = base_path / filename
    if not target.exists():
        torch.hub.download_url_to_file(url, target)
    return str(target)


# NOTE: the checkpoint is saved locally as 'model.pth', whatever its remote file name is.
config_path = _fetch_if_missing(CONFIG_URL, "config.json")
vocab_path = _fetch_if_missing(VOCAB_URL, "vocab.json")
model_path = _fetch_if_missing(MODEL_URL, "model.pth")
speaker_audio_path = _fetch_if_missing(SPEAKER_AUDIO_URL, "speaker_reference.wav")

# Populate the XTTS configuration from the JSON file on disk.
config = XttsConfig()
config.load_json(config_path)

print("Loading model...")
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

# Build the model from the configuration, then load the checkpoint and the
# vocabulary file into it before moving it to the selected device.
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=model_path,
    vocab_path=vocab_path,
    use_deepspeed=False,
    eval=True,
)
model.to(device)

@spaces.GPU
def infer_EGTTS(text: str, speaker_audio_path: str, temperature: float = 0.75):
    """Synthesize `text` using a voice derived from a reference recording.

    Args:
        text: The text to speak. It is passed to the model with the
            language code "ar".
        speaker_audio_path: Path of the reference audio file from which the
            speaker conditioning latents are computed.
        temperature: Sampling temperature forwarded to `model.inference`.

    Returns:
        A `(sample_rate, waveform)` tuple: the sample rate is the constant
        24000 and the waveform is the "wav" entry of the model output.

    Raises:
        gr.Error: If `text` is empty or whitespace-only, or if no reference
            audio was supplied.
    """
    # Validate the inputs up front so the user sees a readable message in the
    # UI instead of an opaque failure from inside the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    # The UI's Audio component is expected to hand over None when it has been
    # cleared (see the Gradio Audio docs) — reject that before touching the model.
    if not speaker_audio_path:
        raise gr.Error("Please provide a speaker reference audio file.")

    print("Computing speaker latents...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[speaker_audio_path])

    print("Inference...")
    out = model.inference(
        text,
        "ar",
        gpt_cond_latent,
        speaker_embedding,
        temperature=temperature,
    )

    # NOTE(review): 24000 is presumably the model's output sample rate —
    # confirm against the loaded config.
    return 24000, out["wav"]

# Usage instructions rendered as Markdown above the input widgets.
markdown_description = """## Instructions:

1. Enter the text you want to synthesize.
2. Upload a 4-5 seconds audio file of the speaker you want to clone.
3. Click on the "Generate" button.

"""

# The page title is kept in line with the heading displayed inside the app.
with gr.Blocks(title="Moroccan-Darija-TTS") as app:
    gr.HTML("<center><h1>Moroccan-Darija-TTS </h1></center>")
    gr.Markdown(markdown_description)
    with gr.Row():
        # Left column: every input plus the trigger button.
        with gr.Column():
            text = gr.Textbox(
                label="Text to synthesize",
                value="السلام عليكم ورحمة الله",
                rtl=True,
                text_align="right",
                lines=3,
            )
            # Pre-filled with the bundled reference recording; the component
            # passes a file path to the callback.
            speaker_reference = gr.Audio(
                label="Speaker reference",
                value=speaker_audio_path,
                type="filepath",
            )
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.1,
                maximum=1.0,
                value=0.75,
                step=0.05,
            )
            generate_btn = gr.Button(value="Generate", variant="primary")
        # Right side: player for the generated audio.
        output = gr.Audio(label="Synthesized audio")

    # Wire the button to the inference function; input order matches its signature.
    generate_btn.click(
        infer_EGTTS,
        inputs=[text, speaker_reference, temperature],
        outputs=output,
    )

app.launch()