# Spaces: Running
import json
import os
import re
import tempfile

import gradio as gr
import torch
from huggingface_hub import hf_hub_download
from snfl_imdann import TifinaghNumberConverter
from TTS.api import TTS
# True when PyTorch can see a CUDA device; used to pick the device for the
# TTS model and the voice-conversion models.
CUDA = torch.cuda.is_available()

# Repository id passed to `hf_hub_download` for the checkpoint and its config.
REPO_ID = "ayymen/Coqui-TTS-Vits-Multispeaker"

# Short name offered in the UI -> model name handed to
# `api.load_vc_model_by_name`.
VOICE_CONVERSION_MODELS = {
    'freevc24': 'voice_conversion_models/multilingual/vctk/freevc24',
    'openvoice_v1': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v1',
    'openvoice_v2': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v2',
}

# Display name -> language code passed to the model as `language`.
VARIANTS = {"Tachelhit": "shi", "Tarifit": "rif"}

# Speaker names offered in the "Speaker" dropdown.
SPEAKERS = ["yan", "sin", "idj"]
# Page title and description shown by the Gradio interface.
my_title = "ⴰⴹⵕⵉⵚ ⵙ ⵉⵎⵙⵍⵉ - Tamazight Text-to-Speech"
my_description = "This model is based on [VITS](https://github.com/jaywalnut310/vits), thanks to 🐸 [Coqui.ai](https://coqui.ai/)."

# Example rows: [text, variant code, speaker, split_sentences].
# Only the first four inputs of the interface are covered; the remaining
# `tts` parameters have defaults.
my_examples = [
    ["ⴰⵣⵓⵍ. ⵎⴰⵏⵣⴰⴽⵉⵏ?", "shi", "yan", True],
    ["ⵡⴰ ⵜⴰⵎⵖⴰⵔⵜ ⵎⴰ ⴷ ⵓⴽⴰⵏ ⵜⵙⴽⵔⵜ?", "shi", "sin", False],
    ["ⴳⵏ ⴰⴷ ⴰⴽ ⵉⵙⵙⴳⵏ ⵕⴱⴱⵉ ⵉⵜⵜⵓ ⴽ.", "shi", "yan", False],
    ["ⴰⵔⵔⴰⵡ ⵏ ⵍⵀⵎⵎ ⵢⵓⴽⵔ ⴰⵖ ⵉⵀⴷⵓⵎⵏ ⵏⵏⵖ!", "shi", "yan", False],
    ["ⴰⵣⵓⵍ. ⵎⴰⵎⵛ ⵜⴷⵊⵉⵜ?", "rif", "idj", True],
    ["ⴰⵇⵎⵎⵓⵎ ⵉⵇⵏⴻⵏ ⵓⵔ ⵜ ⵜⵜⵉⴷⴼⵏ ⵉⵣⴰⵏ.", "rif", "idj", False],
    ["ⵇⵇⵉⵎ ⵅ ⵜⴰⴷⴷⴰⵔⵜ ⵏⵏⵛ!", "rif", "idj", False],
    ["ⵜⴻⵜⵜⵏ ⴰⴳ ⵡⵓⵛⵛⵏ, ⵜⵜⵔⵓⵏ ⵅ ⵓⵎⴽⵙⴰ.", "rif", "idj", False],
]
# Download the checkpoint and its config from REPO_ID, then build the TTS
# wrapper on the GPU when one is available.
best_model_path = hf_hub_download(repo_id=REPO_ID, filename="checkpoint_390000.pth")
config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")
api = TTS(model_path=best_model_path, config_path=config_path).to("cuda" if CUDA else "cpu")

# pre-download voice conversion models
for model in VOICE_CONVERSION_MODELS.values():
    api.load_vc_model_by_name(model, gpu=CUDA)

# Read the model config to tell the user which characters are supported.
# JSON is UTF-8 by specification, so do not depend on the locale encoding.
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

available_chars = config["characters"]["characters"]
available_punct = config["characters"]["punctuations"]
# Digits are accepted too: `tts` converts them to their spoken form.
available_chars = available_chars + "".join(str(i) for i in range(10))
placeholder = f"The available characters are: {available_chars} and the available punctuation is: {available_punct}"
# Input components, in the same order as the parameters of `tts`.
my_inputs = [
    gr.Textbox(lines=5, label="Input Text", placeholder=placeholder),
    gr.Dropdown(label="Variant", choices=list(VARIANTS.items()), value="shi"),
    gr.Dropdown(label="Speaker", choices=SPEAKERS, value="yan"),
    gr.Checkbox(label="Split Sentences (each sentence will be generated separately)", value=False),
    gr.Audio(type="filepath", label="Speaker audio for voice cloning (optional)"),
    # Pre-select the same model `tts` uses by default so the function never
    # receives an empty selection when speaker audio is supplied.
    gr.Dropdown(
        label="Voice Conversion Model",
        choices=list(VOICE_CONVERSION_MODELS.keys()),
        value="freevc24",
    ),
]

# The generated file path returned by `tts` is played back automatically.
my_outputs = gr.Audio(type="filepath", label="Output Audio", autoplay=True)
# Characters replaced before synthesis, applied in a single pass.
# None of the replacement strings contains a key, so this is equivalent to
# applying the substitutions one after another.
_OOV_REPLACEMENTS = str.maketrans({
    "\n": ". ",
    "(": ",",
    ")": ",",
    '"': ",",
    "'": ",",
    ";": ",",
    "-": " ",
})


def tts(
    text: str,
    variant: str = "shi",
    speaker: str = "yan",
    split_sentences: bool = False,
    speaker_wav: str = None,
    voice_cv_model: str = 'freevc24',
) -> str:
    """Synthesize `text` to a WAV file and return the file's path.

    Args:
        text: Text to speak. Newlines, brackets, quotes, semicolons and
            hyphens are replaced first, and every run of digits is converted
            with `TifinaghNumberConverter.convert`.
        variant: Language code forwarded to the model as `language`.
        speaker: Speaker name forwarded to the model as `speaker`.
        split_sentences: Forwarded unchanged to the synthesis call.
        speaker_wav: Optional path to reference audio. When truthy, the
            output is produced with `api.tts_with_vc_to_file`; otherwise
            `api.tts_to_file` is used.
        voice_cv_model: Key of `VOICE_CONVERSION_MODELS` to load when
            `speaker_wav` is given. An empty selection (`None` or `""`)
            falls back to ``'freevc24'``.

    Returns:
        Path of the temporary ``.wav`` file that was written. The file is
        not deleted by this function.

    Raises:
        KeyError: If `voice_cv_model` is non-empty and not a key of
            `VOICE_CONVERSION_MODELS`.
        Exception: Any error raised by the synthesis calls is re-raised
            after the temporary file has been removed.
    """
    # replace oov characters
    text = text.translate(_OOV_REPLACEMENTS)
    # convert numbers to their spoken form
    text = re.sub(r"\d+", lambda m: TifinaghNumberConverter.convert(int(m.group(0))), text)

    # Reserve a unique output path, then close our handle so the synthesis
    # call is the only writer of that file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        out_path = fp.name

    try:
        if speaker_wav:
            # The UI dropdown may deliver no selection; use the default model.
            model_key = voice_cv_model or 'freevc24'
            api.load_vc_model_by_name(VOICE_CONVERSION_MODELS[model_key], gpu=CUDA)
            api.tts_with_vc_to_file(
                text,
                speaker_wav=speaker_wav,
                file_path=out_path,
                split_sentences=split_sentences,
                speaker=speaker,
                language=variant,
            )
        else:
            api.tts_to_file(
                text,
                file_path=out_path,
                split_sentences=split_sentences,
                speaker=speaker,
                language=variant,
            )
    except Exception:
        # The file was created with delete=False; remove it so failed
        # requests do not accumulate orphaned temp files.
        try:
            os.unlink(out_path)
        except OSError:
            pass  # best-effort cleanup; the original error matters more
        raise

    return out_path
# Wire `tts` to the input/output components defined above and start the app.
iface = gr.Interface(
    fn=tts,
    inputs=my_inputs,
    outputs=my_outputs,
    title=my_title,
    description=my_description,
    examples=my_examples,
    cache_examples=True,
)
iface.launch()