import os import gradio as gr import spaces from tts_model import TTSModel import numpy as np # Set HF_HOME for faster restarts with cached models/voices os.environ["HF_HOME"] = "/data/.huggingface" # Create TTS model instance model = TTSModel() @spaces.GPU(duration=10) # Quick initialization def initialize_model(): """Initialize model and get voices""" if model.model is None: if not model.initialize(): raise gr.Error("Failed to initialize model") return model.list_voices() # Get initial voice list voice_list = initialize_model() @spaces.GPU(duration=120) # Allow 5 minutes for processing def generate_speech_from_ui(text, voice_name, speed): """Handle text-to-speech generation from the Gradio UI""" try: audio_array, duration = model.generate_speech(text, voice_name, speed) # Convert float array to int16 range (-32768 to 32767) audio_array = np.array(audio_array, dtype=np.float32) audio_array = (audio_array * 32767).astype(np.int16) return (24000, audio_array), f"Audio Duration: {duration:.2f} seconds\nProcessing complete - check console for detailed metrics" except Exception as e: raise gr.Error(str(e)) # Create Gradio interface with gr.Blocks(title="Kokoro TTS Demo") as demo: gr.HTML( """
Convert text to natural-sounding speech using various voices.