Spaces:

TaiYouWeb
/

tts-xtts2-multi

Running

App Files Files Community

tts-xtts2-multi / app.py

TaiYouWeb

Update app.py

9111f28 verified 29 days ago

raw

history blame

5.04 kB

	import io
	import os
	import tempfile
	from typing import List

	import TTS.api
	import TTS.utils.manage as manage
	import torch
	from pydub import AudioSegment
	import gradio as gr

	import config

	# `spaces` is an optional dependency: record whether it could be imported
	# so later code can decide whether to wrap functions with spaces.GPU.
	try:
	    import spaces
	except ImportError:
	    USING_SPACES = False
	else:
	    USING_SPACES = True

	def gpu_decorator(func):
	    """Wrap *func* with ``spaces.GPU`` when the ``spaces`` package was imported.

	    Args:
	        func: The callable to decorate.

	    Returns:
	        ``spaces.GPU(func)`` if ``USING_SPACES`` is true, otherwise *func*
	        itself, unchanged.
	    """
	    if USING_SPACES:
	        return spaces.GPU(func)
	    return func

	# Target device name handed to `.to(...)` when the models are loaded:
	# "cuda" when torch reports CUDA as available, "cpu" otherwise.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	def ask_tos_patch(self, output_path) -> bool:
	    """Stand-in for ``ModelManager.ask_tos`` that always accepts.

	    Args:
	        self: The instance the method is bound to (unused).
	        output_path: Path argument supplied by the caller (unused).

	    Returns:
	        Always ``True``, after printing a notice that the terms of service
	        were accepted automatically.
	    """
	    print("Automatically accepting the terms of service.")
	    return True

	# Replace ModelManager.ask_tos with the always-accepting stand-in before any
	# model is fetched (presumably the stock method prompts interactively --
	# confirm against Coqui's ModelManager).
	manage.ModelManager.ask_tos = ask_tos_patch
	tts = TTS.api.TTS()

	# Download every model listed in config.models, load it, move it to the
	# selected device, and keep it under the key used in the config.
	# The loop variables avoid the name `id`, which would shadow the builtin
	# for the rest of the module.
	models = {}
	for model_id, model_name in config.models.items():
	    tts.download_model_by_name(model_name)
	    models[model_id] = TTS.api.TTS(model_name).to(device)

	def _speaker_wav_paths(speaker_wavs):
	    """Return the reference-audio input as a list of filesystem paths.

	    Accepts ``None``/empty, a single path, a single object exposing a
	    ``.name`` path attribute, or an iterable of either.
	    """
	    if not speaker_wavs:
	        return []
	    # A bare path must be wrapped first: iterating a str would yield its
	    # individual characters instead of one file.
	    if isinstance(speaker_wavs, (str, os.PathLike)) or hasattr(speaker_wavs, "name"):
	        speaker_wavs = [speaker_wavs]
	    return [
	        os.fspath(item) if isinstance(item, (str, os.PathLike)) else item.name
	        for item in speaker_wavs
	    ]


	def _to_temp_wav(source_path, temp_files):
	    """Re-encode one reference clip as WAV in a temp file and record its path."""
	    with open(source_path, "rb") as src:
	        audio = AudioSegment.from_file(io.BytesIO(src.read()))
	    wav_buffer = io.BytesIO()
	    audio.export(wav_buffer, format="wav")
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	        # Register the path before writing so a failed write is still
	        # removed by the caller's cleanup.
	        temp_files.append(tmp.name)
	        tmp.write(wav_buffer.getvalue())


	@gpu_decorator
	def synthesize_tts(
	    text: str = 'Hello, World!',
	    speaker_wavs: List[gr.File] = None,
	    speaker_idx: str = 'Ana Florence',
	    language: str = 'ja',
	    temperature: float = 0.65,
	    top_k: int = 50,
	    top_p: float = 0.8,
	    speed: float = 1.0,
	    enable_text_splitting: bool = True,
	):
	    """Synthesize *text* with the ``'multi'`` model and return the audio bytes.

	    Args:
	        text: Text to speak.
	        speaker_wavs: Optional reference audio for voice cloning. May be a
	            single file path (what ``gr.Audio(type="filepath")`` supplies),
	            an object with a ``.name`` path attribute, or a list of either.
	            When empty or ``None``, ``speaker_idx`` is used instead.
	        speaker_idx: Speaker name passed as ``speaker`` when no reference
	            audio is given.
	        language: Language code forwarded to the model.
	        temperature: Forwarded to ``tts_to_file``.
	        top_k: Forwarded to ``tts_to_file``.
	        top_p: Forwarded to ``tts_to_file``.
	        speed: Forwarded to ``tts_to_file``.
	        enable_text_splitting: Forwarded to ``tts_to_file``.

	    Returns:
	        The bytes that ``tts_to_file`` wrote into an in-memory buffer.

	    Raises:
	        gr.Error: If a reference clip cannot be read or converted to WAV.
	            (A plain string return would be taken for a file path by the
	            ``gr.Audio`` output, so the failure is raised instead.)
	    """
	    temp_files = []
	    try:
	        for source_path in _speaker_wav_paths(speaker_wavs):
	            try:
	                _to_temp_wav(source_path, temp_files)
	            except Exception as e:
	                raise gr.Error(f"Error processing audio file: {e}") from e

	        # Clone from the converted clips when there are any; otherwise fall
	        # back to the named speaker.
	        if temp_files:
	            voice = {"speaker_wav": temp_files}
	        else:
	            voice = {"speaker": speaker_idx}

	        output_buffer = io.BytesIO()
	        models['multi'].tts_to_file(
	            text=text,
	            language=language,
	            file_path=output_buffer,
	            temperature=temperature,
	            top_k=top_k,
	            top_p=top_p,
	            speed=speed,
	            enable_text_splitting=enable_text_splitting,
	            **voice,
	        )
	        return output_buffer.getvalue()
	    finally:
	        # Always remove the converted clips, on success and on failure.
	        for temp_file in temp_files:
	            try:
	                os.remove(temp_file)
	            except FileNotFoundError:
	                pass

	# Choices for the "Speaker Index" dropdown; the selection reaches
	# synthesize_tts as `speaker_idx`.
	_SPEAKER_NAMES = [
	    "Claribel Dervla", "Daisy Studious", "Gracie Wise", "Tammie Ema", "Alison Dietlinde", "Ana Florence",
	    "Annmarie Nele", "Asya Anara", "Brenda Stern", "Gitta Nikolina", "Henriette Usha", "Sofia Hellen",
	    "Tammy Grit", "Tanja Adelina", "Vjollca Johnnie", "Andrew Chipper", "Badr Odhiambo", "Dionisio Schuyler",
	    "Royston Min", "Viktor Eka", "Abrahan Mack", "Adde Michal", "Baldur Sanjin", "Craig Gutsy",
	    "Damien Black", "Gilberto Mathias", "Ilkin Urbano", "Kazuhiko Atallah", "Ludvig Milivoj", "Suad Qasim",
	    "Torcull Diarmuid", "Viktor Menelaos", "Zacharie Aimilios", "Nova Hogarth", "Maja Ruoho", "Uta Obando",
	    "Lidiya Szekeres", "Chandra MacFarland", "Szofi Granger", "Camilla Holmström", "Lilya Stainthorpe",
	    "Zofija Kendrick", "Narelle Moon", "Barbora MacLean", "Alexandra Hisakawa", "Alma María", "Rosemary Okafor",
	    "Ige Behringer", "Filip Traverse", "Damjan Chapman", "Wulf Carlevaro", "Aaron Dreschner", "Kumar Dahl",
	    "Eugenio Mataracı", "Ferran Simen", "Xavier Hayasaka", "Luis Moray", "Marcos Rudaski",
	]

	# Choices for the "Language" dropdown.
	_LANGUAGE_CODES = [
	    "en", "es", "fr", "de", "it", "pt", "pl", "tr",
	    "ru", "nl", "cs", "ar", "zh", "ja", "hu", "ko",
	]

	# Input components, created in the positional order of synthesize_tts's
	# parameters. NOTE(review): the Language, Temperature and Top-P widgets
	# start at "en", 1 and 1, which differ from the function's own defaults
	# ('ja', 0.65, 0.8) -- confirm this is intended.
	_text_input = gr.Textbox(value="Hello, World!", label="Text to Synthesize")
	_clone_input = gr.Audio(
	    sources=['upload', 'microphone'],
	    label="Voice Clone(optional)",
	    type="filepath",
	)
	_speaker_input = gr.Dropdown(
	    choices=_SPEAKER_NAMES,
	    value="Ana Florence",
	    label="Speaker Index",
	)
	_language_input = gr.Dropdown(
	    choices=_LANGUAGE_CODES,
	    value="en",
	    label="Language",
	)
	_temperature_input = gr.Slider(0, 2, value=1, step=0.01, label="Temperature")
	_top_k_input = gr.Slider(1, 100, value=50, step=1, label="Top-K")
	_top_p_input = gr.Slider(0, 1, value=1, step=0.01, label="Top-P")
	_speed_input = gr.Slider(0.5, 2, value=1.0, step=0.01, label="Speed")
	_splitting_input = gr.Checkbox(value=True, label="Enable Text Splitting")

	inputs = [
	    _text_input,
	    _clone_input,
	    _speaker_input,
	    _language_input,
	    _temperature_input,
	    _top_k_input,
	    _top_p_input,
	    _speed_input,
	    _splitting_input,
	]

	outputs = gr.Audio(label="Generated Speech")

	# Build the interface around synthesize_tts and start serving it.
	_interface = gr.Interface(
	    fn=synthesize_tts,
	    inputs=inputs,
	    outputs=outputs,
	    title="Text-to-Speech Synthesis with Gradio",
	)
	_interface.launch()