# Hugging Face Spaces status: Running on Zero
import os
import tempfile

import gradio as gr
import numpy as np
import soundfile as sf
import spaces
from voxcpm import VoxCPM
# Instantiate the checkpoint a single time at import so that every request
# handled by the app reuses the same loaded model object.
MODEL_ID = "openbmb/VoxCPM-0.5B"
model = VoxCPM.from_pretrained(MODEL_ID)
def generate_speech(
    text,
    prompt_audio,
    prompt_text,
    cfg_value,
    inference_timesteps,
    normalize,
    denoise,
    retry_badcase,
    retry_badcase_max_times,
    retry_badcase_ratio_threshold,
):
    """Synthesize speech for ``text`` and return the path of the written WAV file.

    Args:
        text: Text to synthesize. Empty or whitespace-only input shows a
            warning and produces no audio.
        prompt_audio: Optional file path of a reference recording, forwarded
            to the model as ``prompt_wav_path``; ``None`` when not supplied.
        prompt_text: Optional transcript of the reference recording. Empty or
            whitespace-only values are passed to the model as ``None``.
        cfg_value: Forwarded unchanged as ``cfg_value``.
        inference_timesteps: Coerced to ``int`` before being forwarded.
        normalize: Forwarded unchanged as ``normalize``.
        denoise: Forwarded unchanged as ``denoise``.
        retry_badcase: Forwarded unchanged as ``retry_badcase``.
        retry_badcase_max_times: Coerced to ``int`` before being forwarded.
        retry_badcase_ratio_threshold: Forwarded unchanged.

    Returns:
        Path of a temporary ``.wav`` file written at 16000 Hz, or ``None``
        when no text was provided.

    Raises:
        gr.Error: If generation or writing the audio file fails; the original
            exception is chained as the cause.
    """
    # NOTE(review): this file imports `spaces`, but no `@spaces.GPU` decorator
    # is applied to this handler -- confirm whether ZeroGPU allocation is
    # expected for this Space.

    if not text or not text.strip():
        gr.Warning("Please enter text to generate speech")
        return None

    # The audio component yields a file path or None, so it can be passed on
    # as-is.
    prompt_wav_path = prompt_audio

    # An untouched textbox yields "" (not None); treat empty and
    # whitespace-only reference text alike as "not provided".
    if not prompt_text or not prompt_text.strip():
        prompt_text = None

    try:
        wav = model.generate(
            text=text,
            prompt_wav_path=prompt_wav_path,
            prompt_text=prompt_text,
            cfg_value=cfg_value,
            inference_timesteps=int(inference_timesteps),
            normalize=normalize,
            denoise=denoise,
            retry_badcase=retry_badcase,
            retry_badcase_max_times=int(retry_badcase_max_times),
            retry_badcase_ratio_threshold=retry_badcase_ratio_threshold,
        )

        # Reserve a unique path and close the handle *before* writing to it by
        # name; writing while the handle is still open fails on platforms
        # that lock open files.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name
        sf.write(output_path, wav, 16000)
        return output_path
    except Exception as e:
        # gr.Error is an exception class: it only reaches the user when it is
        # raised, not when it is merely instantiated.
        raise gr.Error(f"Error generating speech: {str(e)}") from e
# Build the Gradio interface: inputs in the first column, generated audio and
# usage tips in the second, then wire the button to `generate_speech`.
# NOTE(review): the nesting below was reconstructed from the order of the
# `with` statements because the original indentation was lost -- confirm the
# placement of the checkbox/number rows and of the tips panel.
with gr.Blocks(title="VoxCPM Text-to-Speech", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ VoxCPM Text-to-Speech
        Generate highly expressive speech using VoxCPM-0.5B model. Optionally clone voices by providing reference audio.
        [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Input section
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter the text you want to convert to speech...",
                lines=3,
                value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech.",
            )

            with gr.Accordion("Voice Cloning", open=False):
                prompt_audio = gr.Audio(
                    label="Reference Audio (Upload a reference audio file for voice cloning)",
                    type="filepath",
                    sources=["upload"],
                )
                prompt_text = gr.Textbox(
                    label="Reference Text",
                    placeholder="Text corresponding to the reference audio",
                    lines=2,
                )

            with gr.Accordion("Advanced Settings", open=False):
                cfg_value = gr.Slider(
                    minimum=0.5,
                    maximum=5.0,
                    value=2.0,
                    step=0.1,
                    label="CFG Value",
                    info="LM guidance on LocDiT, higher for better adherence to prompt",
                )
                inference_timesteps = gr.Slider(
                    minimum=5,
                    maximum=50,
                    value=10,
                    step=1,
                    label="Inference Timesteps",
                    info="Higher for better quality, lower for faster speed",
                )

                with gr.Row():
                    normalize = gr.Checkbox(
                        value=True,
                        label="Normalize",
                        info="Enable external TN tool",
                    )
                    denoise = gr.Checkbox(
                        value=True,
                        label="Denoise",
                        info="Enable external Denoise tool",
                    )
                    retry_badcase = gr.Checkbox(
                        value=True,
                        label="Retry Bad Cases",
                        info="Enable retrying for bad cases",
                    )

                with gr.Row():
                    retry_badcase_max_times = gr.Number(
                        value=3,
                        minimum=1,
                        maximum=10,
                        step=1,
                        label="Max Retry Times",
                    )
                    retry_badcase_ratio_threshold = gr.Number(
                        value=6.0,
                        minimum=1.0,
                        maximum=10.0,
                        step=0.5,
                        label="Retry Ratio Threshold",
                    )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            # Output section
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                autoplay=False,
            )
            gr.Markdown(
                """
                ### Tips:
                - For voice cloning, upload a clear reference audio (3-10 seconds recommended)
                - Higher CFG values provide better prompt adherence but may affect naturalness
                - Increase inference timesteps for better quality at the cost of speed
                - The retry mechanism helps handle edge cases automatically
                """
            )

    # Connect the generate button; the input order must match the positional
    # parameters of `generate_speech`.
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            prompt_audio,
            prompt_text,
            cfg_value,
            inference_timesteps,
            normalize,
            denoise,
            retry_badcase,
            retry_badcase_max_times,
            retry_badcase_ratio_threshold,
        ],
        outputs=audio_output,
        show_progress="full",
    )

demo.launch()