File size: 4,989 Bytes
5ca847f
 
 
db3663c
5ca847f
 
aa93b1b
5ca847f
 
6b12cc3
5ca847f
 
 
e07a041
 
 
 
 
 
 
 
 
 
 
 
5ca847f
 
aa93b1b
 
 
 
 
 
 
5ca847f
 
aa93b1b
5ca847f
 
e07a041
db3663c
 
 
 
 
 
 
 
 
 
 
5ca847f
 
 
 
51c71fc
 
5ca847f
 
 
 
4931874
5ca847f
db3663c
5ca847f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db3663c
5ca847f
db3663c
 
5ca847f
 
 
db3663c
 
 
 
f7c7f99
6dfe6e8
937d301
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6dfe6e8
 
 
 
 
318fc09
db3663c
318fc09
db3663c
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import io
import os
import tempfile
from typing import List

import TTS.api
import TTS.utils.manage as manage
import torch
from pydub import AudioSegment
import gradio as gr

import config

# ``spaces`` is an optional dependency: record whether it could be imported so
# the rest of the module can decide whether to use it.
try:
    import spaces
except ImportError:
    USING_SPACES = False
else:
    USING_SPACES = True

def gpu_decorator(func):
    """Return ``func`` wrapped by ``spaces.GPU`` when ``spaces`` was imported.

    When the optional ``spaces`` package is unavailable (``USING_SPACES`` is
    false), ``func`` is returned unchanged.
    """
    return spaces.GPU(func) if USING_SPACES else func

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def ask_tos_patch(self, output_path) -> bool:
    """Stand-in for ``ModelManager.ask_tos`` that always reports acceptance.

    Both arguments are ignored; a notice is printed and ``True`` is returned.
    """
    print("Automatically accepting the terms of service.")
    return True

# Swap the model manager's terms-of-service hook for the auto-accepting patch
# (presumably the stock hook prompts interactively -- verify against the TTS
# version in use).
manage.ModelManager.ask_tos = ask_tos_patch
tts = TTS.api.TTS()

# Download and load every model listed in ``config.models`` once at import
# time. The dict is keyed by the identifiers used in the config.
# Loop names avoid shadowing the ``id`` builtin at module level.
models = {}
for model_id, model_name in config.models.items():
    tts.download_model_by_name(model_name)
    models[model_id] = TTS.api.TTS(model_name).to(device)

@gpu_decorator
def synthesize_tts(
    text: str = 'Hello, World!',
    speaker_wavs: List[gr.File] = None,
    speaker_idx: str = 'Ana Florence',
    language: str = 'ja',
    temperature: float = 0.65,
    top_k: int = 50,
    top_p: float = 0.8,
    speed: float = 1.0,
    enable_text_splitting: bool = True,
):
    """Synthesize speech for ``text`` and return the generated audio as bytes.

    When ``speaker_wavs`` is non-empty, every upload is decoded with pydub,
    re-encoded as WAV into a temporary file, and the list of temporary paths
    is passed to the model as ``speaker_wav``. Otherwise ``speaker_idx`` is
    passed as ``speaker``. Temporary files are removed before returning.

    Args:
        text: Text to synthesize.
        speaker_wavs: Optional uploaded reference recordings. Each item may be
            a path string or an object exposing the path through ``.name``.
        speaker_idx: Speaker name used when no reference recordings are given.
        language: Language code forwarded to the model.
        temperature: Forwarded unchanged to ``tts_to_file``.
        top_k: Forwarded unchanged to ``tts_to_file``.
        top_p: Forwarded unchanged to ``tts_to_file``.
        speed: Forwarded unchanged to ``tts_to_file``.
        enable_text_splitting: Forwarded unchanged to ``tts_to_file``.

    Returns:
        The bytes that ``tts_to_file`` wrote into an in-memory buffer.

    Raises:
        gr.Error: If an uploaded recording cannot be decoded or re-encoded.
    """
    temp_files = []
    try:
        for speaker_wav in speaker_wavs or []:
            # Uploads arrive either as plain path strings or as file-like
            # wrappers that expose the path via ``.name``; accept both.
            source_path = getattr(speaker_wav, "name", speaker_wav)
            with open(source_path, "rb") as f:
                speaker_wav_bytes = f.read()

            try:
                audio = AudioSegment.from_file(io.BytesIO(speaker_wav_bytes))
                wav_buffer = io.BytesIO()
                audio.export(wav_buffer, format="wav")
            except Exception as e:
                # Raise instead of returning the message: the interface output
                # is an audio component, so a returned string would be
                # treated as audio data rather than shown as an error.
                raise gr.Error(f"Error processing audio file: {e}") from e

            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
                # Register the path before writing so the ``finally`` block
                # cleans it up even if the write fails.
                temp_files.append(temp_wav_file.name)
                temp_wav_file.write(wav_buffer.getvalue())

        # Reference recordings, when present, take precedence over the
        # named speaker.
        if temp_files:
            voice_kwargs = {"speaker_wav": temp_files}
        else:
            voice_kwargs = {"speaker": speaker_idx}

        # NOTE(review): assumes ``models`` has a 'multi' entry and that
        # ``tts_to_file`` accepts a file-like ``file_path`` -- confirm against
        # ``config.models`` and the installed TTS version.
        output_buffer = io.BytesIO()
        models['multi'].tts_to_file(
            text=text,
            language=language,
            file_path=output_buffer,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            speed=speed,
            enable_text_splitting=enable_text_splitting,
            **voice_kwargs,
        )
        return output_buffer.getvalue()

    finally:
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)

# Speaker names offered in the "Speaker Index" dropdown.
_speaker_choices = [
    "Claribel Dervla", "Daisy Studious", "Gracie Wise", "Tammie Ema", "Alison Dietlinde", "Ana Florence",
    "Annmarie Nele", "Asya Anara", "Brenda Stern", "Gitta Nikolina", "Henriette Usha", "Sofia Hellen",
    "Tammy Grit", "Tanja Adelina", "Vjollca Johnnie", "Andrew Chipper", "Badr Odhiambo", "Dionisio Schuyler",
    "Royston Min", "Viktor Eka", "Abrahan Mack", "Adde Michal", "Baldur Sanjin", "Craig Gutsy",
    "Damien Black", "Gilberto Mathias", "Ilkin Urbano", "Kazuhiko Atallah", "Ludvig Milivoj", "Suad Qasim",
    "Torcull Diarmuid", "Viktor Menelaos", "Zacharie Aimilios", "Nova Hogarth", "Maja Ruoho", "Uta Obando",
    "Lidiya Szekeres", "Chandra MacFarland", "Szofi Granger", "Camilla Holmström", "Lilya Stainthorpe",
    "Zofija Kendrick", "Narelle Moon", "Barbora MacLean", "Alexandra Hisakawa", "Alma María", "Rosemary Okafor",
    "Ige Behringer", "Filip Traverse", "Damjan Chapman", "Wulf Carlevaro", "Aaron Dreschner", "Kumar Dahl",
    "Eugenio Mataracı", "Ferran Simen", "Xavier Hayasaka", "Luis Moray", "Marcos Rudaski",
]

# Language codes offered in the "Language" dropdown.
_language_choices = [
    "en", "es", "fr", "de", "it", "pt", "pl", "tr",
    "ru", "nl", "cs", "ar", "zh", "ja", "hu", "ko",
]

_text_box = gr.Textbox(value="Hello, World!", label="Text to Synthesize")
_clone_files = gr.Files(label="Voice Clone(optional)")
_speaker_dropdown = gr.Dropdown(
    choices=_speaker_choices,
    value="Ana Florence",
    label="Speaker Index",
)
_language_dropdown = gr.Dropdown(
    choices=_language_choices,
    value="en",
    label="Language",
)
_temperature_slider = gr.Slider(0, 2, value=1, step=0.01, label="Temperature")
_top_k_slider = gr.Slider(1, 100, value=50, step=1, label="Top-K")
_top_p_slider = gr.Slider(0, 1, value=1, step=0.01, label="Top-P")
_speed_slider = gr.Slider(0.5, 2, value=1.0, step=0.01, label="Speed")
_splitting_checkbox = gr.Checkbox(value=True, label="Enable Text Splitting")

# Input components, listed in the same order as the parameters of
# ``synthesize_tts``.
inputs = [
    _text_box,
    _clone_files,
    _speaker_dropdown,
    _language_dropdown,
    _temperature_slider,
    _top_k_slider,
    _top_p_slider,
    _speed_slider,
    _splitting_checkbox,
]

# Single audio component that receives the value returned by synthesize_tts.
outputs = gr.Audio(label="Generated Speech")

# Build the interface around synthesize_tts and start serving it right away.
demo = gr.Interface(
    title="Text-to-Speech Synthesis with Gradio",
    fn=synthesize_tts,
    inputs=inputs,
    outputs=outputs,
)
demo.launch()