"""Gradio demo app exposing the Kokoro TTS model through a small web UI."""

import os

# Set HF_HOME for faster restarts with cached models/voices.
# This has to run before the imports below: huggingface_hub resolves its
# cache location from the environment once, when it is first imported, so
# assigning the variable after those imports would not redirect the cache.
os.environ["HF_HOME"] = "/data/.huggingface"

import gradio as gr  # noqa: E402
import numpy as np  # noqa: E402
import spaces  # noqa: E402

from tts_model import TTSModel  # noqa: E402

# First element of the (rate, samples) tuple handed to gr.Audio.
SAMPLE_RATE = 24000
DEMO_TEXT_PATH = "the_time_machine_hgwells.txt"

# Create TTS model instance
model = TTSModel()


@spaces.GPU(duration=10)  # Quick initialization
def initialize_model():
    """Initialize the model if needed and return the available voices.

    Returns:
        Whatever ``model.list_voices()`` returns.

    Raises:
        gr.Error: If ``model.initialize()`` reports failure.
    """
    if model.model is None and not model.initialize():
        raise gr.Error("Failed to initialize model")
    return model.list_voices()


# Get initial voice list
voice_list = initialize_model()


def _float_to_int16(audio):
    """Convert float audio samples to int16 PCM.

    Samples are clipped to [-1.0, 1.0] before scaling; without the clip,
    out-of-range values would wrap around when cast to int16.
    """
    samples = np.asarray(audio, dtype=np.float32)
    return (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)


@spaces.GPU(duration=120)  # Allow up to 2 minutes (120 s) for processing
def generate_speech_from_ui(text, voice_name, speed):
    """Handle text-to-speech generation from the Gradio UI.

    Args:
        text: Text to synthesize.
        voice_name: Voice selected in the dropdown.
        speed: Speed value from the slider.

    Returns:
        A ``((sample_rate, int16_samples), info_text)`` pair matching the
        audio and textbox outputs wired to the submit button.

    Raises:
        gr.Error: If generation fails; the original exception is chained.
    """
    try:
        audio_array, duration = model.generate_speech(text, voice_name, speed)
        pcm = _float_to_int16(audio_array)
        return (
            (SAMPLE_RATE, pcm),
            f"Audio Duration: {duration:.2f} seconds\nProcessing complete - check console for detailed metrics",
        )
    except Exception as e:
        # Surface the failure in the UI while keeping the original traceback.
        raise gr.Error(str(e)) from e


# Read the demo text once; it feeds both the default input and the statistics.
with open(DEMO_TEXT_PATH, encoding="utf-8") as demo_file:
    demo_text = demo_file.read()

# Create Gradio interface
with gr.Blocks(title="Kokoro TTS Demo") as demo:
    gr.HTML(
        """

Kokoro TTS Demo

Convert text to natural-sounding speech using various voices.

"""
    )

    with gr.Row():
        with gr.Column(scale=3):
            # Input components
            text_input = gr.TextArea(
                label="Text to speak",
                placeholder="Enter text here...",
                lines=3,
                value=demo_text[:1000],
            )
            voice_dropdown = gr.Dropdown(
                label="Voice",
                choices=voice_list,
                value=voice_list[0] if voice_list else None,
                allow_custom_value=True,  # Allow custom values to avoid warnings
            )
            speed_slider = gr.Slider(
                label="Speed",
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
            )
            submit_btn = gr.Button("Generate Speech")

        with gr.Column(scale=2):
            # Output components
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
                format="wav",
                autoplay=False,
            )
            duration_text = gr.Textbox(
                label="Processing Info",
                interactive=False,
                lines=4,
            )

    # Set up event handler
    submit_btn.click(
        fn=generate_speech_from_ui,
        inputs=[text_input, voice_dropdown, speed_slider],
        outputs=[audio_output, duration_text],
    )

    # Add voice descriptions
    gr.Markdown("""
    ### Available Voices
    - Adult Female (af): Base female voice
        - Bella (af_bella): Warm and friendly
        - Nicole (af_nicole): Warm and Whispered
        - Sarah (af_sarah): Soft and gentle
        - Sky (af_sky): You know her, you love her
    - Adult Male (am): Base male voice
        - Adam (am_adam): Clear and Friendly
        - Michael (am_michael): Smooth and natural
    - Young Female (bf):
        - Emma (bf_emma): Sweet and cheerful
        - Isabella (bf_isabella): Lively and expressive
    - Young Male (bm):
        - George (bm_george): Young and energetic
        - Lewis (bm_lewis): Deep and confident
    """)

    # Add text analysis info
    with gr.Row():
        with gr.Column():
            gr.Markdown("""
            ### Demo Text Info
            The demo text is loaded from H.G. Wells' "The Time Machine". This classic text demonstrates the system's ability to handle long-form content through chunking.
            """)

            text_stats = gr.Textbox(
                label="Text Statistics",
                interactive=False,
                value=f"Characters: {len(demo_text)}\nEstimated chunks: {len(demo_text) // 300 + 1}",
            )

# Launch the app
if __name__ == "__main__":
    demo.launch()