# VoxCPM-0.5B / app.py
# Hugging Face Space by akhaliq (HF Staff) — "Update app.py", commit 2ce4e07 (verified).
import gradio as gr
import soundfile as sf
import numpy as np
from voxcpm import VoxCPM
import tempfile
import os
import spaces
# Load the model once at module import so every request reuses the same weights.
model = VoxCPM.from_pretrained("openbmb/VoxCPM-0.5B")
@spaces.GPU(duration=120)
def generate_speech(
    text,
    prompt_audio,
    prompt_text,
    cfg_value,
    inference_timesteps,
    normalize,
    denoise,
    retry_badcase,
    retry_badcase_max_times,
    retry_badcase_ratio_threshold
):
    """Synthesize speech from *text* with VoxCPM, optionally cloning a voice.

    Args:
        text: Text to synthesize; empty/whitespace-only input aborts with a warning.
        prompt_audio: Optional filepath to a reference clip for voice cloning
            (``gr.Audio(type="filepath")`` already delivers a path or None).
        prompt_text: Transcript of the reference clip; blank/whitespace is
            treated as absent.
        cfg_value: Classifier-free-guidance strength forwarded to the model.
        inference_timesteps: Diffusion step count (coerced to int for the model).
        normalize: Enable the external text-normalization tool.
        denoise: Enable the external denoise tool.
        retry_badcase: Enable retrying of generations detected as bad cases.
        retry_badcase_max_times: Maximum retry attempts (coerced to int).
        retry_badcase_ratio_threshold: Ratio threshold used by the retry heuristic.

    Returns:
        Path to a temporary 16 kHz WAV file, or None when no text was given.

    Raises:
        gr.Error: If model.generate() fails for any reason.
    """
    # Reject empty *and* whitespace-only input (the original only checked falsy text).
    if not text or not text.strip():
        gr.Warning("Please enter text to generate speech")
        return None

    # None means "no voice cloning"; otherwise prompt_audio is already a filepath.
    prompt_wav_path = prompt_audio if prompt_audio is not None else None

    # Treat empty OR whitespace-only prompt text as absent. The original test
    # `prompt_text and prompt_text.strip() == ""` short-circuited on "", so an
    # empty (but non-None) string was forwarded to the model unchanged.
    if prompt_text is not None and not prompt_text.strip():
        prompt_text = None

    try:
        wav = model.generate(
            text=text,
            prompt_wav_path=prompt_wav_path,
            prompt_text=prompt_text,
            cfg_value=cfg_value,
            inference_timesteps=int(inference_timesteps),
            normalize=normalize,
            denoise=denoise,
            retry_badcase=retry_badcase,
            retry_badcase_max_times=int(retry_badcase_max_times),
            retry_badcase_ratio_threshold=retry_badcase_ratio_threshold,
        )
    except Exception as e:
        # gr.Error must be *raised* to surface in the Gradio UI; the original
        # merely instantiated it, so failures ended silently with a None output.
        raise gr.Error(f"Error generating speech: {str(e)}") from e

    # delete=False so the file survives for Gradio to stream after we return.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        sf.write(tmp_file.name, wav, 16000)
        return tmp_file.name
# ---------------------------------------------------------------------------
# Gradio UI — component creation order below determines the on-screen layout.
# ---------------------------------------------------------------------------
with gr.Blocks(title="VoxCPM Text-to-Speech", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown(
        """
# 🎙️ VoxCPM Text-to-Speech
Generate highly expressive speech using VoxCPM-0.5B model. Optionally clone voices by providing reference audio.
[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
"""
    )
    with gr.Row():
        # Left column: text input plus optional cloning / advanced settings.
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
                value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech."
            )
            # Voice cloning: a reference clip and its transcript, both optional.
            with gr.Accordion("Voice Cloning", open=False):
                prompt_audio = gr.Audio(
                    label="Reference Audio (Upload a reference audio file for voice cloning)",
                    type="filepath",  # delivers a path, matching generate_speech's expectation
                    sources=["upload"]
                )
                prompt_text = gr.Textbox(
                    label="Reference Text",
                    placeholder="Text corresponding to the reference audio",
                    lines=2
                )
            # Generation knobs forwarded verbatim to model.generate().
            with gr.Accordion("Advanced Settings", open=False):
                cfg_value = gr.Slider(
                    minimum=0.5,
                    maximum=5.0,
                    value=2.0,
                    step=0.1,
                    label="CFG Value",
                    info="LM guidance on LocDiT, higher for better adherence to prompt"
                )
                inference_timesteps = gr.Slider(
                    minimum=5,
                    maximum=50,
                    value=10,
                    step=1,
                    label="Inference Timesteps",
                    info="Higher for better quality, lower for faster speed"
                )
                with gr.Row():
                    normalize = gr.Checkbox(
                        value=True,
                        label="Normalize",
                        info="Enable external TN tool"
                    )
                    denoise = gr.Checkbox(
                        value=True,
                        label="Denoise",
                        info="Enable external Denoise tool"
                    )
                    retry_badcase = gr.Checkbox(
                        value=True,
                        label="Retry Bad Cases",
                        info="Enable retrying for bad cases"
                    )
                # Retry tuning, only meaningful when retry_badcase is on.
                with gr.Row():
                    retry_badcase_max_times = gr.Number(
                        value=3,
                        minimum=1,
                        maximum=10,
                        step=1,
                        label="Max Retry Times"
                    )
                    retry_badcase_ratio_threshold = gr.Number(
                        value=6.0,
                        minimum=1.0,
                        maximum=10.0,
                        step=0.5,
                        label="Retry Ratio Threshold"
                    )
            generate_btn = gr.Button("🎡 Generate Speech", variant="primary", size="lg")
        # Right column: synthesized audio and usage tips.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",  # generate_speech returns a WAV file path
                autoplay=False
            )
            gr.Markdown(
                """
### Tips:
- For voice cloning, upload a clear reference audio (3-10 seconds recommended)
- Higher CFG values provide better prompt adherence but may affect naturalness
- Increase inference timesteps for better quality at the cost of speed
- The retry mechanism helps handle edge cases automatically
"""
            )
    # Wire the button to the inference function; input order must match
    # generate_speech's parameter order.
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            prompt_audio,
            prompt_text,
            cfg_value,
            inference_timesteps,
            normalize,
            denoise,
            retry_badcase,
            retry_badcase_max_times,
            retry_badcase_ratio_threshold
        ],
        outputs=audio_output,
        show_progress="full"
    )
demo.launch()