Spaces:
Running
Running
import io | |
import os | |
import tempfile | |
from typing import List | |
import TTS.api | |
import TTS.utils.manage as manage | |
import torch | |
from pydub import AudioSegment | |
import gradio as gr | |
import config | |
# Optional dependency: the `spaces` module is imported when available
# (presumably the Hugging Face Spaces helper package -- confirm).
# USING_SPACES records whether that import succeeded so later code can
# decide whether `spaces.GPU` may be used.
try:
    import spaces
except ImportError:
    USING_SPACES = False
else:
    USING_SPACES = True
def gpu_decorator(func):
    """Wrap *func* with ``spaces.GPU`` when the ``spaces`` module is available.

    Args:
        func: The callable to decorate.

    Returns:
        ``spaces.GPU(func)`` if ``USING_SPACES`` is true, otherwise *func*
        itself, unchanged.
    """
    # Without the optional `spaces` module there is nothing to wrap with.
    if not USING_SPACES:
        return func
    return spaces.GPU(func)
# Torch device string used when loading the models: CUDA when torch reports
# it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def ask_tos_patch(self, output_path):
    """Replacement for ``ModelManager.ask_tos`` that never prompts.

    The module installs this function over ``manage.ModelManager.ask_tos``,
    so it keeps the ``(self, output_path)`` signature even though neither
    argument is used.

    Args:
        self: The object the method is bound to (unused).
        output_path: Unused.

    Returns:
        Always ``True``.
    """
    notice = "Automatically accepting the terms of service."
    print(notice)
    return True
# Install the non-interactive terms-of-service handler before any model is
# fetched, so that loading below never waits on a prompt.
manage.ModelManager.ask_tos = ask_tos_patch

# Bare TTS instance, used here only for its download helper.
tts = TTS.api.TTS()

# Loaded TTS models keyed by the identifiers used in `config.models`
# (the synthesis function looks up the 'multi' entry).
models = {}
# NOTE: the loop variables were previously called `id` and `model`; `id`
# shadowed the builtin `id()` for the remainder of the module.
for model_id, model_name in config.models.items():
    tts.download_model_by_name(model_name)
    models[model_id] = TTS.api.TTS(model_name).to(device)
def _speaker_wav_paths(speaker_wavs):
    """Return the voice-clone input as a flat list of file-system paths.

    Accepts ``None``/empty, a single path, a single object exposing a
    ``.name`` path attribute, or a list mixing the two.
    """
    if not speaker_wavs:
        return []
    # A lone upload arrives as one value, not a list; iterating a str would
    # walk it character by character, so wrap it first.
    if isinstance(speaker_wavs, (str, os.PathLike)) or hasattr(speaker_wavs, "name"):
        speaker_wavs = [speaker_wavs]
    return [
        os.fspath(item) if isinstance(item, (str, os.PathLike)) else item.name
        for item in speaker_wavs
    ]


def synthesize_tts(
    text: str = 'Hello, World!',
    speaker_wavs: List[gr.File] = None,
    speaker_idx: str = 'Ana Florence',
    language: str = 'ja',
    temperature: float = 0.65,
    top_k: int = 50,
    top_p: float = 0.8,
    speed: float = 1.0,
    enable_text_splitting: bool = True,
):
    """Synthesize *text* with the ``'multi'`` model and return the audio bytes.

    When reference clips are supplied, each one is decoded with pydub,
    re-encoded as WAV into a temporary file, and the list of those files is
    passed to the model as ``speaker_wav``. Otherwise the named speaker
    *speaker_idx* is used.

    Args:
        text: Text to synthesize.
        speaker_wavs: Optional reference audio. Despite the annotation, a
            single path string is accepted as well (this is what a
            ``gr.Audio(type="filepath")`` input delivers), as are objects
            with a ``.name`` path attribute and lists of either.
        speaker_idx: Speaker name used when no reference audio is given.
        language: Language code forwarded to the model.
        temperature: Forwarded to ``tts_to_file``.
        top_k: Forwarded to ``tts_to_file``.
        top_p: Forwarded to ``tts_to_file``.
        speed: Forwarded to ``tts_to_file``.
        enable_text_splitting: Forwarded to ``tts_to_file``.

    Returns:
        The bytes the model wrote to the in-memory output buffer
        (presumably WAV-encoded -- confirm against ``tts_to_file``).

    Raises:
        gr.Error: If a reference clip cannot be decoded or re-encoded.
    """
    temp_files = []
    try:
        for source_path in _speaker_wav_paths(speaker_wavs):
            try:
                audio = AudioSegment.from_file(source_path)
                wav_buffer = io.BytesIO()
                audio.export(wav_buffer, format="wav")
            except Exception as e:
                # Raise a user-visible error; returning the message string
                # would be handed to the audio output as if it were audio.
                raise gr.Error(f"Error processing audio file: {e}") from e

            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav_file:
                # Register the file before writing so it is removed even if
                # the write itself fails.
                temp_files.append(temp_wav_file.name)
                temp_wav_file.write(wav_buffer.getvalue())

        # Voice cloning and named-speaker synthesis differ in one argument.
        if temp_files:
            voice_kwargs = {"speaker_wav": temp_files}
        else:
            voice_kwargs = {"speaker": speaker_idx}

        output_buffer = io.BytesIO()
        models['multi'].tts_to_file(
            text=text,
            language=language,
            file_path=output_buffer,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            speed=speed,
            enable_text_splitting=enable_text_splitting,
            **voice_kwargs,
        )
        return output_buffer.getvalue()
    finally:
        # delete=False above means the temp files must be removed manually.
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)
# Speaker names offered in the "Speaker Index" dropdown.
_SPEAKER_NAMES = [
    "Claribel Dervla", "Daisy Studious", "Gracie Wise", "Tammie Ema",
    "Alison Dietlinde", "Ana Florence", "Annmarie Nele", "Asya Anara",
    "Brenda Stern", "Gitta Nikolina", "Henriette Usha", "Sofia Hellen",
    "Tammy Grit", "Tanja Adelina", "Vjollca Johnnie", "Andrew Chipper",
    "Badr Odhiambo", "Dionisio Schuyler", "Royston Min", "Viktor Eka",
    "Abrahan Mack", "Adde Michal", "Baldur Sanjin", "Craig Gutsy",
    "Damien Black", "Gilberto Mathias", "Ilkin Urbano", "Kazuhiko Atallah",
    "Ludvig Milivoj", "Suad Qasim", "Torcull Diarmuid", "Viktor Menelaos",
    "Zacharie Aimilios", "Nova Hogarth", "Maja Ruoho", "Uta Obando",
    "Lidiya Szekeres", "Chandra MacFarland", "Szofi Granger",
    "Camilla Holmström", "Lilya Stainthorpe", "Zofija Kendrick",
    "Narelle Moon", "Barbora MacLean", "Alexandra Hisakawa", "Alma María",
    "Rosemary Okafor", "Ige Behringer", "Filip Traverse", "Damjan Chapman",
    "Wulf Carlevaro", "Aaron Dreschner", "Kumar Dahl", "Eugenio Mataracı",
    "Ferran Simen", "Xavier Hayasaka", "Luis Moray", "Marcos Rudaski",
]

# Language codes offered in the "Language" dropdown.
_LANGUAGE_CODES = [
    "en", "es", "fr", "de", "it", "pt", "pl", "tr",
    "ru", "nl", "cs", "ar", "zh", "ja", "hu", "ko",
]

# One component per synthesize_tts parameter, in the same order as its
# signature, because the Interface passes the values positionally.
_text_input = gr.Textbox(label="Text to Synthesize", value="Hello, World!")
_voice_input = gr.Audio(
    label="Voice Clone(optional)",
    sources=['upload', 'microphone'],
    type="filepath",
)
_speaker_input = gr.Dropdown(
    label="Speaker Index",
    choices=_SPEAKER_NAMES,
    value="Ana Florence",
)
_language_input = gr.Dropdown(
    label="Language",
    choices=_LANGUAGE_CODES,
    value="en",
)
_temperature_input = gr.Slider(minimum=0, maximum=2, value=1, step=0.01, label="Temperature")
_top_k_input = gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top-K")
_top_p_input = gr.Slider(minimum=0, maximum=1, value=1, step=0.01, label="Top-P")
_speed_input = gr.Slider(minimum=0.5, maximum=2, value=1.0, step=0.01, label="Speed")
_splitting_input = gr.Checkbox(label="Enable Text Splitting", value=True)

inputs = [
    _text_input,
    _voice_input,
    _speaker_input,
    _language_input,
    _temperature_input,
    _top_k_input,
    _top_p_input,
    _speed_input,
    _splitting_input,
]

outputs = gr.Audio(label="Generated Speech")

# Build the web UI around synthesize_tts and start serving it.
demo = gr.Interface(
    fn=synthesize_tts,
    inputs=inputs,
    outputs=outputs,
    title="Text-to-Speech Synthesis with Gradio",
)
demo.launch()