Spaces:
Sleeping
Sleeping
File size: 4,989 Bytes
5ca847f db3663c 5ca847f aa93b1b 5ca847f 6b12cc3 5ca847f e07a041 5ca847f aa93b1b 5ca847f aa93b1b 5ca847f e07a041 db3663c 5ca847f 51c71fc 5ca847f 4931874 5ca847f db3663c 5ca847f db3663c 5ca847f db3663c 5ca847f db3663c f7c7f99 6dfe6e8 937d301 6dfe6e8 318fc09 db3663c 318fc09 db3663c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import io
import os
import tempfile
from typing import List
import TTS.api
import TTS.utils.manage as manage
import torch
from pydub import AudioSegment
import gradio as gr
import config
# `spaces` is an optional dependency: record whether it could be imported so
# later code can decide whether to use it.
try:
    import spaces
except ImportError:
    USING_SPACES = False
else:
    USING_SPACES = True
def gpu_decorator(func):
    """Return *func* wrapped by ``spaces.GPU`` when ``spaces`` was imported.

    When ``USING_SPACES`` is false the function is returned unchanged, so the
    decorator is a no-op.
    """
    return spaces.GPU(func) if USING_SPACES else func
# Run on the GPU when torch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
def ask_tos_patch(self, output_path) -> bool:
    """Report acceptance of the terms of service without asking anyone.

    Written with a method signature (``self`` plus ``output_path``); neither
    argument is used. Prints a notice and always returns ``True``.
    """
    print("Automatically accepting the terms of service.")
    return True
# Swap the model manager's ``ask_tos`` for the auto-accepting stand-in
# (presumably the original prompts interactively, which a hosted app cannot
# answer -- confirm against the TTS library).
manage.ModelManager.ask_tos = ask_tos_patch

# Bare TTS handle, used here only to download the configured models.
tts = TTS.api.TTS()

# Loaded models keyed by the identifiers used in ``config.models``.
models = {}
# The loop variables are deliberately not called ``id``/``model``: a
# module-level ``id`` would shadow the builtin for the rest of the file.
for model_id, model_name in config.models.items():
    tts.download_model_by_name(model_name)
    models[model_id] = TTS.api.TTS(model_name).to(device)
@gpu_decorator
def synthesize_tts(
    text: str = 'Hello, World!',
    speaker_wavs: List[gr.File] = None,
    speaker_idx: str = 'Ana Florence',
    language: str = 'ja',
    temperature: float = 0.65,
    top_k: int = 50,
    top_p: float = 0.8,
    speed: float = 1.0,
    enable_text_splitting: bool = True,
):
    """Synthesize speech for *text* with the model stored under ``models['multi']``.

    Args:
        text: Text to speak.
        speaker_wavs: Optional uploaded reference clips. Each item may be a
            path string or an object exposing the path as ``.name``. Every
            clip is re-encoded to WAV in a temporary file and passed to the
            model as ``speaker_wav``.
        speaker_idx: Speaker name passed as ``speaker``; used only when no
            reference clips are supplied.
        language: Language code forwarded to the model.
        temperature: Forwarded unchanged to ``tts_to_file``.
        top_k: Forwarded unchanged to ``tts_to_file``.
        top_p: Forwarded unchanged to ``tts_to_file``.
        speed: Forwarded unchanged to ``tts_to_file``.
        enable_text_splitting: Forwarded unchanged to ``tts_to_file``.

    Returns:
        The bytes that ``tts_to_file`` wrote into an in-memory buffer
        (presumably WAV data -- confirm against the TTS library).

    Raises:
        gr.Error: If a reference clip cannot be read or converted. The
            previous behaviour returned the message as a plain string, which
            an audio output would have tried to treat as a file path.
    """
    temp_files = []
    try:
        for speaker_wav in speaker_wavs or []:
            # Accept both plain path strings and file wrappers with ``.name``.
            source_path = getattr(speaker_wav, "name", speaker_wav)

            # Reserve a unique .wav path and register it for cleanup *before*
            # any conversion work, so a failure below cannot leak the file.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as reserved:
                temp_path = reserved.name
            temp_files.append(temp_path)

            try:
                # Decode from an open handle (not the path) so format
                # detection is based on content, as it was with BytesIO.
                with open(source_path, "rb") as source_file:
                    audio = AudioSegment.from_file(source_file)
                # export() hands back the handle it opened; close it so the
                # data is flushed before the model reads the file.
                audio.export(temp_path, format="wav").close()
            except Exception as e:
                raise gr.Error(f"Error processing audio file: {e}") from e

        # Cloned voice when reference clips exist, built-in speaker otherwise.
        if temp_files:
            voice_kwargs = {"speaker_wav": temp_files}
        else:
            voice_kwargs = {"speaker": speaker_idx}

        output_buffer = io.BytesIO()
        models['multi'].tts_to_file(
            text=text,
            language=language,
            file_path=output_buffer,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            speed=speed,
            enable_text_splitting=enable_text_splitting,
            **voice_kwargs,
        )
        return output_buffer.getvalue()
    finally:
        # Always remove the converted reference clips, success or failure.
        for temp_file in temp_files:
            try:
                os.remove(temp_file)
            except FileNotFoundError:
                pass
# Speaker names offered in the "Speaker Index" dropdown.
SPEAKER_CHOICES = [
    "Claribel Dervla", "Daisy Studious", "Gracie Wise", "Tammie Ema", "Alison Dietlinde", "Ana Florence",
    "Annmarie Nele", "Asya Anara", "Brenda Stern", "Gitta Nikolina", "Henriette Usha", "Sofia Hellen",
    "Tammy Grit", "Tanja Adelina", "Vjollca Johnnie", "Andrew Chipper", "Badr Odhiambo", "Dionisio Schuyler",
    "Royston Min", "Viktor Eka", "Abrahan Mack", "Adde Michal", "Baldur Sanjin", "Craig Gutsy",
    "Damien Black", "Gilberto Mathias", "Ilkin Urbano", "Kazuhiko Atallah", "Ludvig Milivoj", "Suad Qasim",
    "Torcull Diarmuid", "Viktor Menelaos", "Zacharie Aimilios", "Nova Hogarth", "Maja Ruoho", "Uta Obando",
    "Lidiya Szekeres", "Chandra MacFarland", "Szofi Granger", "Camilla Holmström", "Lilya Stainthorpe",
    "Zofija Kendrick", "Narelle Moon", "Barbora MacLean", "Alexandra Hisakawa", "Alma María", "Rosemary Okafor",
    "Ige Behringer", "Filip Traverse", "Damjan Chapman", "Wulf Carlevaro", "Aaron Dreschner", "Kumar Dahl",
    "Eugenio Mataracı", "Ferran Simen", "Xavier Hayasaka", "Luis Moray", "Marcos Rudaski",
]

# Language codes offered in the "Language" dropdown.
LANGUAGE_CHOICES = [
    "en", "es", "fr", "de", "it", "pt", "pl", "tr",
    "ru", "nl", "cs", "ar", "zh", "ja", "hu", "ko",
]

# Input components, listed in the same order as synthesize_tts's parameters.
inputs = [
    gr.Textbox(label="Text to Synthesize", value="Hello, World!"),
    gr.Files(label="Voice Clone(optional)"),
    gr.Dropdown(label="Speaker Index", choices=SPEAKER_CHOICES, value="Ana Florence"),
    gr.Dropdown(label="Language", choices=LANGUAGE_CHOICES, value="en"),
    gr.Slider(minimum=0, maximum=2, value=1, step=0.01, label="Temperature"),
    gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top-K"),
    gr.Slider(minimum=0, maximum=1, value=1, step=0.01, label="Top-P"),
    gr.Slider(minimum=0.5, maximum=2, value=1.0, step=0.01, label="Speed"),
    gr.Checkbox(label="Enable Text Splitting", value=True),
]

outputs = gr.Audio(label="Generated Speech")

# Build the interface and start serving it as soon as the module runs.
demo = gr.Interface(
    fn=synthesize_tts,
    inputs=inputs,
    outputs=outputs,
    title="Text-to-Speech Synthesis with Gradio",
)
demo.launch()
|