import os
import tempfile

import gradio as gr
import soundfile as sf
import numpy as np
from voxcpm import VoxCPM
import spaces

# Sample rate (Hz) used when writing the generated waveform to disk.
OUTPUT_SAMPLE_RATE = 16000

# Load the model once at startup so every request reuses the same instance.
model = VoxCPM.from_pretrained("openbmb/VoxCPM-0.5B")


def _write_wav_to_temp_file(wav, sample_rate):
    """Write *wav* to a fresh temporary ``.wav`` file and return its path.

    The temporary file handle is closed before the audio is written, so the
    path can be reopened by name on platforms that do not allow a second open
    of a still-open temporary file. If the write fails, the empty temporary
    file is removed before the exception propagates.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        output_path = tmp_file.name
    try:
        sf.write(output_path, wav, sample_rate)
    except Exception:
        # Best-effort cleanup; the original write error is what matters.
        try:
            os.remove(output_path)
        except OSError:
            pass
        raise
    return output_path


@spaces.GPU(duration=120)
def generate_speech(
    text,
    prompt_audio,
    prompt_text,
    cfg_value,
    inference_timesteps,
    normalize,
    denoise,
    retry_badcase,
    retry_badcase_max_times,
    retry_badcase_ratio_threshold
):
    """Synthesize speech for *text* and return the path of the generated WAV.

    Args:
        text: Text to synthesize. Empty or whitespace-only text is rejected.
        prompt_audio: Path to a reference audio file, or ``None``.
        prompt_text: Text corresponding to the reference audio. Empty or
            whitespace-only values are passed to the model as ``None``.
        cfg_value: Guidance value forwarded to ``model.generate``.
        inference_timesteps: Number of inference timesteps (cast to ``int``).
        normalize: Forwarded to ``model.generate`` as ``normalize``.
        denoise: Forwarded to ``model.generate`` as ``denoise``.
        retry_badcase: Forwarded to ``model.generate`` as ``retry_badcase``.
        retry_badcase_max_times: Maximum retry count (cast to ``int``).
        retry_badcase_ratio_threshold: Forwarded to ``model.generate``.

    Returns:
        Path to a temporary ``.wav`` file, or ``None`` when no text was given
        (a warning is shown in the UI in that case).

    Raises:
        gr.Error: If generation or writing the output file fails.
    """
    if not text or not text.strip():
        gr.Warning("Please enter text to generate speech")
        return None

    # The Gradio audio component already yields a file path (type="filepath"),
    # or None when nothing was uploaded, so it can be forwarded directly.
    prompt_wav_path = prompt_audio

    # An untouched textbox yields "", which must become None just like a
    # whitespace-only value.
    if not prompt_text or not prompt_text.strip():
        prompt_text = None

    try:
        wav = model.generate(
            text=text,
            prompt_wav_path=prompt_wav_path,
            prompt_text=prompt_text,
            cfg_value=cfg_value,
            inference_timesteps=int(inference_timesteps),
            normalize=normalize,
            denoise=denoise,
            retry_badcase=retry_badcase,
            retry_badcase_max_times=int(retry_badcase_max_times),
            retry_badcase_ratio_threshold=retry_badcase_ratio_threshold
        )
        return _write_wav_to_temp_file(wav, OUTPUT_SAMPLE_RATE)
    except Exception as e:
        # gr.Error only reaches the UI when it is raised; merely constructing
        # it would silently swallow the failure.
        raise gr.Error(f"Error generating speech: {str(e)}") from e


# Create Gradio interface
with gr.Blocks(title="VoxCPM Text-to-Speech", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ VoxCPM Text-to-Speech

        Generate highly expressive speech using VoxCPM-0.5B model. Optionally clone voices by providing reference audio.

        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Input section
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
                value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech."
            )

            with gr.Accordion("Voice Cloning", open=False):
                prompt_audio = gr.Audio(
                    label="Reference Audio (Upload a reference audio file for voice cloning)",
                    type="filepath",
                    sources=["upload"]
                )
                prompt_text = gr.Textbox(
                    label="Reference Text",
                    placeholder="Text corresponding to the reference audio",
                    lines=2
                )

            with gr.Accordion("Advanced Settings", open=False):
                cfg_value = gr.Slider(
                    minimum=0.5,
                    maximum=5.0,
                    value=2.0,
                    step=0.1,
                    label="CFG Value",
                    info="LM guidance on LocDiT, higher for better adherence to prompt"
                )
                inference_timesteps = gr.Slider(
                    minimum=5,
                    maximum=50,
                    value=10,
                    step=1,
                    label="Inference Timesteps",
                    info="Higher for better quality, lower for faster speed"
                )

                with gr.Row():
                    normalize = gr.Checkbox(
                        value=True,
                        label="Normalize",
                        info="Enable external TN tool"
                    )
                    denoise = gr.Checkbox(
                        value=True,
                        label="Denoise",
                        info="Enable external Denoise tool"
                    )
                    retry_badcase = gr.Checkbox(
                        value=True,
                        label="Retry Bad Cases",
                        info="Enable retrying for bad cases"
                    )

                with gr.Row():
                    retry_badcase_max_times = gr.Number(
                        value=3,
                        minimum=1,
                        maximum=10,
                        step=1,
                        label="Max Retry Times"
                    )
                    retry_badcase_ratio_threshold = gr.Number(
                        value=6.0,
                        minimum=1.0,
                        maximum=10.0,
                        step=0.5,
                        label="Retry Ratio Threshold"
                    )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            # Output section
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                autoplay=False
            )

            gr.Markdown(
                """
                ### Tips:
                - For voice cloning, upload a clear reference audio (3-10 seconds recommended)
                - Higher CFG values provide better prompt adherence but may affect naturalness
                - Increase inference timesteps for better quality at the cost of speed
                - The retry mechanism helps handle edge cases automatically
                """
            )

    # Connect the generate button
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            prompt_audio,
            prompt_text,
            cfg_value,
            inference_timesteps,
            normalize,
            denoise,
            retry_badcase,
            retry_badcase_max_times,
            retry_badcase_ratio_threshold
        ],
        outputs=audio_output,
        show_progress="full"
    )

demo.launch()