File size: 4,214 Bytes
e95bc25
 
15e7b85
e95bc25
2c32692
5536535
 
 
2c32692
 
e95bc25
ad86f4b
e95bc25
4e602d2
edbfc9a
4e602d2
 
 
 
ad86f4b
 
 
 
845cab6
e95bc25
 
 
ad86f4b
 
 
 
 
 
 
 
e95bc25
 
5536535
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e95bc25
5536535
ad86f4b
 
 
 
 
e95bc25
 
15e7b85
e95bc25
ad86f4b
f194fff
 
 
 
4e602d2
ad86f4b
f194fff
96fabd6
f194fff
5536535
 
 
e95bc25
15e7b85
4e602d2
ad86f4b
15e7b85
ad86f4b
15e7b85
5536535
15e7b85
e95bc25
 
 
 
 
15e7b85
 
e95bc25
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import gradio as gr
import tempfile
from TTS.api import TTS
from huggingface_hub import hf_hub_download
import torch
import json
from snfl_imdann import TifinaghNumberConverter
import re

# Whether a CUDA GPU is available; controls device placement for TTS and VC models.
CUDA = torch.cuda.is_available()

# Hugging Face Hub repo holding the trained VITS checkpoint and its config.
REPO_ID = "ayymen/Coqui-TTS-Vits-Multispeaker"

# Coqui model-zoo identifiers for the supported voice-conversion backends,
# keyed by the short name shown in the UI dropdown.
VOICE_CONVERSION_MODELS = {
    'freevc24': 'voice_conversion_models/multilingual/vctk/freevc24',
    'openvoice_v1': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v1',
    'openvoice_v2': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v2',
}

# Tamazight variants: display label -> language code used by the model.
VARIANTS = {"Tachelhit": "shi", "Tarifit": "rif"}

# Speaker IDs baked into the multi-speaker checkpoint.
SPEAKERS = ["yan", "sin", "idj"]

my_title = "ⴰⴹⵕⵉⵚ ⵙ ⵉⵎⵙⵍⵉ - Tamazight Text-to-Speech"
my_description = "This model is based on [VITS](https://github.com/jaywalnut310/vits), thanks to 🐸 [Coqui.ai](https://coqui.ai/)." 

# Example rows for the Gradio UI: (text, variant code, speaker, split_sentences).
my_examples = [
    ["ⴰⵣⵓⵍ. ⵎⴰⵏⵣⴰⴽⵉⵏ?", "shi", "yan", True],
    ["ⵡⴰ ⵜⴰⵎⵖⴰⵔⵜ ⵎⴰ ⴷ ⵓⴽⴰⵏ ⵜⵙⴽⵔⵜ?", "shi", "sin", False],
    ["ⴳⵏ ⴰⴷ ⴰⴽ ⵉⵙⵙⴳⵏ ⵕⴱⴱⵉ ⵉⵜⵜⵓ ⴽ.", "shi", "yan", False],
    ["ⴰⵔⵔⴰⵡ ⵏ ⵍⵀⵎⵎ ⵢⵓⴽⵔ ⴰⵖ ⵉⵀⴷⵓⵎⵏ ⵏⵏⵖ!", "shi", "yan", False],
    ["ⴰⵣⵓⵍ. ⵎⴰⵎⵛ ⵜⴷⵊⵉⵜ?", "rif", "idj", True],
    ["ⴰⵇⵎⵎⵓⵎ ⵉⵇⵏⴻⵏ ⵓⵔ ⵜ ⵜⵜⵉⴷⴼⵏ ⵉⵣⴰⵏ.", "rif", "idj", False],
    ["ⵇⵇⵉⵎ ⵅ ⵜⴰⴷⴷⴰⵔⵜ ⵏⵏⵛ!", "rif", "idj", False],
    ["ⵜⴻⵜⵜⵏ ⴰⴳ ⵡⵓⵛⵛⵏ, ⵜⵜⵔⵓⵏ ⵅ ⵓⵎⴽⵙⴰ.", "rif", "idj", False]
]

# Fetch the checkpoint and config from the Hub (cached locally after first run).
best_model_path = hf_hub_download(repo_id=REPO_ID, filename="checkpoint_390000.pth") 
config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")

# Single shared synthesizer instance, placed on GPU when one is available.
api = TTS(model_path=best_model_path, config_path=config_path).to("cuda" if CUDA else "cpu")

# pre-download voice conversion models
# (done eagerly so the first voice-cloning request is not stalled by a download)
for model in VOICE_CONVERSION_MODELS.values():
    api.load_vc_model_by_name(model, gpu=CUDA)

# Read the character set the model was trained on from its config, so the UI
# placeholder can tell users which input characters are supported.
# Explicit UTF-8: the charset contains Tifinagh characters, so relying on the
# platform default encoding would break on non-UTF-8 locales.
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)
    available_chars = config["characters"]["characters"]
    available_punct = config["characters"]["punctuations"]

# Digits are also accepted: tts() converts them to their spoken Tifinagh form
# before synthesis.
available_chars = available_chars + "0123456789"

placeholder = f"The available characters are: {available_chars} and the available punctuation is: {available_punct}"

# Gradio input widgets, in the same order as the tts() parameters.
my_inputs = [
    gr.Textbox(lines=5, label="Input Text", placeholder=placeholder),
    # (label, value) tuples: the UI shows the variant name, tts() receives the code.
    gr.Dropdown(label="Variant", choices=list(VARIANTS.items()), value="shi"),
    gr.Dropdown(label="Speaker", choices=SPEAKERS, value="yan"),
    gr.Checkbox(label="Split Sentences (each sentence will be generated separately)", value=False),
    gr.Audio(type="filepath", label="Speaker audio for voice cloning (optional)"),
    # Default to 'freevc24' (matching the tts() default); without an explicit
    # value Gradio submits None, which would KeyError in VOICE_CONVERSION_MODELS
    # when a speaker wav is provided.
    gr.Dropdown(label="Voice Conversion Model", choices=list(VOICE_CONVERSION_MODELS.keys()), value='freevc24'),
]

# Synthesized speech is returned as a wav file path and auto-played.
my_outputs = gr.Audio(type="filepath", label="Output Audio", autoplay=True)

# Single-char substitutions for characters outside the model's training set,
# applied in one C-level pass via str.translate.
_OOV_TABLE = str.maketrans({
    "\n": ". ",  # treat line breaks as sentence boundaries
    "(": ",",
    ")": ",",
    '"': ",",
    "'": ",",
    ";": ",",
    "-": " ",
})

def tts(text: str, variant: str = "shi", speaker: str = "yan", split_sentences: bool = False, speaker_wav: str | None = None, voice_cv_model: str = 'freevc24'):
    """Synthesize *text* and return the path of a generated wav file.

    Args:
        text: Input text in Tifinagh script; out-of-vocabulary punctuation is
            normalized and digits are converted to their spoken form.
        variant: Language code of the Tamazight variant ("shi" or "rif").
        speaker: Speaker ID from the multi-speaker checkpoint.
        split_sentences: Whether the model synthesizes each sentence separately.
        speaker_wav: Optional reference audio path; when given, the output is
            voice-converted to match this speaker.
        voice_cv_model: Key into VOICE_CONVERSION_MODELS selecting the VC backend.

    Returns:
        Filesystem path of the synthesized wav (a non-deleted temp file, which
        Gradio serves to the client).
    """
    # The UI dropdown may submit None/empty; fall back to the default backend
    # instead of raising KeyError below.
    if not voice_cv_model:
        voice_cv_model = 'freevc24'

    # replace oov characters
    text = text.translate(_OOV_TABLE)

    # convert numbers to their spoken form
    text = re.sub(r"\d+", lambda x: TifinaghNumberConverter.convert(int(x.group(0))), text)

    # delete=False: the file must outlive this function so Gradio can serve it.
    with tempfile.NamedTemporaryFile(suffix = ".wav", delete = False) as fp:
        if speaker_wav:
            api.load_vc_model_by_name(VOICE_CONVERSION_MODELS[voice_cv_model], gpu=CUDA)
            api.tts_with_vc_to_file(text, speaker_wav=speaker_wav, file_path=fp.name, split_sentences=split_sentences, speaker=speaker, language=variant)
        else:
            api.tts_to_file(text, file_path=fp.name, split_sentences=split_sentences, speaker=speaker, language=variant)

    return fp.name

# Wire everything into a Gradio interface and start the web server.
# cache_examples=True pre-renders the example rows at startup so clicking an
# example plays instantly instead of re-running synthesis.
iface = gr.Interface(
    fn=tts, 
    inputs=my_inputs, 
    outputs=my_outputs, 
    title=my_title, 
    description=my_description, 
    examples=my_examples,
    cache_examples=True
)
iface.launch()