刘鑫 committed on
Commit
8bf7b95
·
1 Parent(s): 608ef95
Files changed (5) hide show
  1. .gitattributes +9 -0
  2. app.py +557 -0
  3. assets/voxcpm-logo.png +3 -0
  4. examples/example.wav +3 -0
  5. requirements.txt +5 -0
.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
37
+ *.png filter=lfs diff=lfs merge=lfs -text
38
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
39
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
40
+ *.avi filter=lfs diff=lfs merge=lfs -text
41
+ *.mov filter=lfs diff=lfs merge=lfs -text
42
+ *.gif filter=lfs diff=lfs merge=lfs -text
43
+ *.jpg filter=lfs diff=lfs merge=lfs -text
44
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,557 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import logging
4
+ import traceback
5
+ import numpy as np
6
+ import gradio as gr
7
+ from typing import Optional, Tuple
8
+ import soundfile as sf
9
+ from pathlib import Path
10
+ import requests
11
+ import json
12
+ import base64
13
+ import io
14
+ import tempfile
15
+ import uuid
16
+ import time
17
+
18
# Configure logging: every record goes both to stdout and, appended, to app.log.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('app.log', mode='a', encoding='utf-8'),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Startup banner: record the runtime environment once at import time.
logger.info("=" * 50)
logger.info("🚀 VoxCPM应用启动中...")
logger.info(f"Python版本: {sys.version}")
logger.info(f"工作目录: {os.getcwd()}")
logger.info(f"环境变量PORT: {os.environ.get('PORT', '未设置')}")
logger.info(f"环境变量RAY_SERVE_URL: {os.environ.get('RAY_SERVE_URL', '未设置')}")
logger.info("=" * 50)
37
+
38
+
39
class RayServeVoxCPMClient:
    """Client wrapper that talks to the Ray Serve TTS/ASR HTTP API.

    The base URL comes from the ``RAY_SERVE_URL`` environment variable and
    falls back to a built-in default. The constructor probes ``/health`` and
    raises if the service cannot be reached.
    """

    def __init__(self) -> None:
        """Resolve the API base URL and verify the service is reachable.

        Raises:
            Exception: whatever ``requests`` raises when the ``/health`` probe
                fails (connection error, timeout, non-2xx status).
        """
        logger.info("📡 初始化RayServeVoxCPMClient...")

        try:
            # Ray Serve API URL (can be overridden via env)
            self.RAY_SERVE_DEFAULT_URL = "https://d09162224-pytorch251-cuda124-u-5512-iyr4lse3-8970.550c.cloud"
            self.api_url = self._resolve_server_url()
            logger.info(f"🔗 准备连接到Ray Serve API: {self.api_url}")

            # Test connection
            logger.info("⏳ 测试Ray Serve连接...")
            health_start = time.time()
            health_response = requests.get(f"{self.api_url}/health", timeout=10)
            health_response.raise_for_status()
            health_time = time.time() - health_start
            logger.info(f"✅ 成功连接到Ray Serve API: {self.api_url} (耗时: {health_time:.3f}秒)")

        except Exception as e:
            logger.error(f"❌ 初始化RayServeVoxCPMClient失败: {e}")
            logger.error(f"错误详情: {traceback.format_exc()}")
            raise

    # ----------- Helpers -----------
    def _resolve_server_url(self) -> str:
        """Resolve Ray Serve API base URL, prefer env RAY_SERVE_URL.

        Trailing slashes are stripped so endpoint paths can be appended
        with a single ``/``.
        """
        return os.environ.get("RAY_SERVE_URL", self.RAY_SERVE_DEFAULT_URL).rstrip("/")

    def _audio_file_to_base64(self, audio_file_path: str) -> str:
        """Read an audio file and return its raw bytes as a base64 string.

        Args:
            audio_file_path: Path of the audio file to encode.

        Returns:
            The file content, base64-encoded and decoded to ``str`` as UTF-8.

        Raises:
            Exception: any error from opening or reading the file is logged
                and re-raised unchanged.
        """
        try:
            with open(audio_file_path, 'rb') as f:
                audio_bytes = f.read()
            return base64.b64encode(audio_bytes).decode('utf-8')
        except Exception as e:
            logger.error(f"音频文件转base64失败: {e}")
            raise

    @staticmethod
    def _float_audio_to_int16(audio_data: np.ndarray) -> np.ndarray:
        """Convert float samples to mono int16 PCM.

        Args:
            audio_data: Float samples, either 1-D or 2-D with channels on the
                second axis.

        Returns:
            A 1-D ``int16`` array. For 2-D input only the first channel is
            kept. Samples are clipped to [-1.0, 1.0] before scaling so that
            out-of-range values saturate instead of wrapping around.
        """
        if audio_data.ndim == 2:
            audio_data = audio_data[:, 0]
        return (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)

    def _base64_to_audio_array(self, base64_audio: str, sample_rate: int = 16000) -> Tuple[int, np.ndarray]:
        """Decode base64-encoded audio into a ``(sample_rate, int16 array)`` pair.

        The bytes are parsed in memory through a file-like object, so no
        temporary file is created.

        Args:
            base64_audio: Base64-encoded audio file content.
            sample_rate: Unused; kept for backward compatibility. The returned
                rate is the one reported by the decoded audio itself.

        Returns:
            ``(sample_rate, audio_array)`` where ``audio_array`` is mono int16.

        Raises:
            Exception: any decoding or parsing error is logged and re-raised.
        """
        try:
            audio_bytes = base64.b64decode(base64_audio)
            audio_data, sr = sf.read(io.BytesIO(audio_bytes), dtype='float32')
            return sr, self._float_audio_to_int16(audio_data)
        except Exception as e:
            logger.error(f"base64转音频数组失败: {e}")
            raise

    # ----------- Functional endpoints -----------
    def prompt_wav_recognition(self, prompt_wav: Optional[str]) -> str:
        """Use Ray Serve ASR API for speech recognition.

        Args:
            prompt_wav: Path of the audio file to transcribe, or ``None``.

        Returns:
            The recognized text, or ``""`` when no file is given, when the
            service answers with a code other than 3000, or when any error
            occurs (errors are logged, never raised).
        """
        logger.info(f"🎵 开始语音识别,输入文件: {prompt_wav}")

        if prompt_wav is None or not prompt_wav.strip():
            logger.info("⚠️ 没有提供音频文件,跳过语音识别")
            return ""

        try:
            start_time = time.time()
            logger.info(f"📁 处理音频文件: {prompt_wav}")

            # Encode the audio file as base64 for the JSON payload.
            convert_start = time.time()
            audio_base64 = self._audio_file_to_base64(prompt_wav)
            convert_time = time.time() - convert_start
            logger.info(f"🔄 音频转base64耗时: {convert_time:.3f}秒")

            logger.info("📡 调用Ray Serve ASR API...")

            # Build the ASR request.
            asr_request = {
                "reqid": str(uuid.uuid4()),
                "audio_data": audio_base64,
                "language": "auto",
                "use_itn": True
            }

            # Call the ASR endpoint.
            api_start = time.time()
            response = requests.post(
                f"{self.api_url}/asr",
                json=asr_request,
                headers={"Content-Type": "application/json"},
                timeout=30
            )
            response.raise_for_status()
            api_time = time.time() - api_start

            result_data = response.json()
            total_time = time.time() - start_time

            logger.info(f"⏱️ ASR API请求耗时: {api_time:.3f}秒")
            logger.info(f"⏱️ ASR总耗时: {total_time:.3f}秒")
            logger.info(f"✅ 语音识别完成,响应: {result_data}")

            # The code below treats response code 3000 as success.
            if result_data.get("code") == 3000:
                recognized_text = result_data.get("text", "")
                logger.info(f"🎯 识别结果: '{recognized_text}'")
                return recognized_text
            else:
                logger.warning(f"⚠️ ASR识别失败: {result_data.get('message', 'Unknown error')}")
                return ""

        except Exception as e:
            # Best effort: recognition failures must not break the UI callback.
            logger.error(f"❌ 语音识别失败: {e}")
            logger.error(f"错误详情: {traceback.format_exc()}")
            return ""

    def _call_ray_serve_generate(
        self,
        text: str,
        prompt_wav_path: Optional[str] = None,
        prompt_text: Optional[str] = None,
        cfg_value: float = 2.0,
        inference_timesteps: int = 10,
        do_normalize: bool = True,
        denoise: bool = True,
    ) -> Tuple[int, np.ndarray]:
        """Call Ray Serve /generate API and return (sample_rate, waveform).

        Args:
            text: Text to synthesize.
            prompt_wav_path: Optional reference audio path for voice cloning.
            prompt_text: Optional transcript of the reference audio.
            cfg_value: Guidance value forwarded as ``cfg_value``.
            inference_timesteps: Forwarded as ``inference_timesteps``.
            do_normalize: Forwarded as ``do_normalize``.
            denoise: Forwarded as ``denoise``.

        Returns:
            ``(sample_rate, waveform)`` with the waveform as mono int16.

        Raises:
            RuntimeError: when the HTTP request fails (chained to the original
                ``requests`` exception), when the service answers with a code
                other than 3000, or when the returned audio payload is empty.
        """
        logger.info(f"🔥 调用Ray Serve生成API,文本: '{text[:60]}...'")

        try:
            start_time = time.time()

            # Build the request payload.
            prepare_start = time.time()
            audio_config = {
                "voice_type": "default",  # default voice unless cloning is requested below
                "encoding": "wav",
                "speed_ratio": 1.0,
                "cfg_value": cfg_value,
                "inference_timesteps": inference_timesteps
            }

            # Voice-clone mode is used only when BOTH reference audio and its
            # transcript are provided; otherwise the default voice is used.
            if prompt_wav_path and prompt_text:
                logger.info("🎭 使用语音克隆模式")
                convert_start = time.time()
                audio_base64 = self._audio_file_to_base64(prompt_wav_path)
                convert_time = time.time() - convert_start
                logger.info(f"🔄 参考音频转base64耗时: {convert_time:.3f}秒")

                audio_config.update({
                    "voice_type": None,  # clear voice_type to select clone mode
                    "prompt_wav": audio_base64,
                    "prompt_text": prompt_text
                })
            else:
                logger.info("🎤 使用默认语音模式")

            request_data = {
                "audio": audio_config,
                "request": {
                    "reqid": str(uuid.uuid4()),
                    "text": text,
                    "operation": "query",
                    "do_normalize": do_normalize,
                    "denoise": denoise
                }
            }
            prepare_time = time.time() - prepare_start
            logger.info(f"⏱️ 请求数据准备耗时: {prepare_time:.3f}秒")

            logger.info(f"📡 发送请求到Ray Serve: {self.api_url}/generate")
            logger.info(f"📊 请求参数: CFG={cfg_value}, 推理步数={inference_timesteps}, 文本长度={len(text)}")

            # Call the generate endpoint.
            api_start = time.time()
            response = requests.post(
                f"{self.api_url}/generate",
                json=request_data,
                headers={"Content-Type": "application/json"},
                timeout=120  # TTS may take a long time
            )
            response.raise_for_status()
            api_time = time.time() - api_start

            result_data = response.json()
            logger.info(f"⏱️ TTS API请求耗时: {api_time:.3f}秒")
            logger.info(f"✅ Ray Serve响应: code={result_data.get('code')}, message={result_data.get('message')}")

            # The code below treats response code 3000 as success.
            if result_data.get("code") == 3000:
                audio_base64 = result_data.get("data", "")
                if not audio_base64:
                    raise RuntimeError("Ray Serve返回的音频数据为空")

                # Decode the base64 audio into a numpy waveform.
                decode_start = time.time()
                sample_rate, audio_array = self._base64_to_audio_array(audio_base64)
                decode_time = time.time() - decode_start
                total_time = time.time() - start_time

                duration_ms = result_data.get('addition', {}).get('duration', 'unknown')
                logger.info(f"🔄 音频解码耗时: {decode_time:.3f}秒")
                logger.info(f"⏱️ TTS总耗时: {total_time:.3f}秒")
                logger.info(f"🎵 音频生成成功,采样率: {sample_rate}, 时长: {duration_ms}ms")
                logger.info(f"📈 性能指标: API={api_time:.3f}s, 解码={decode_time:.3f}s, 总计={total_time:.3f}s")

                return sample_rate, audio_array
            else:
                error_msg = result_data.get("message", "Unknown error")
                raise RuntimeError(f"Ray Serve生成失败: {error_msg}")

        except requests.exceptions.RequestException as e:
            logger.error(f"❌ Ray Serve请求失败: {e}")
            # Chain the original exception so the transport-level cause is kept.
            raise RuntimeError(
                f"Failed to connect Ray Serve TTS service: {e}. Check RAY_SERVE_URL='{self.api_url}' and service status"
            ) from e
        except Exception as e:
            logger.error(f"❌ Ray Serve调用异常: {e}")
            raise

    def generate_tts_audio(
        self,
        text_input: str,
        prompt_wav_path_input: Optional[str] = None,
        prompt_text_input: Optional[str] = None,
        cfg_value_input: float = 2.0,
        inference_timesteps_input: int = 10,
        do_normalize: bool = True,
        denoise: bool = True,
    ) -> Tuple[int, np.ndarray]:
        """Validate the UI inputs and synthesize speech through Ray Serve.

        ``None`` inputs are tolerated: missing text is treated as empty (and
        rejected with ``ValueError``), missing prompt fields become ``""``,
        and missing numeric settings fall back to 2.0 / 10.

        Args:
            text_input: Target text to synthesize.
            prompt_wav_path_input: Optional reference audio path.
            prompt_text_input: Optional transcript of the reference audio.
            cfg_value_input: Guidance value; ``None`` falls back to 2.0.
            inference_timesteps_input: Timesteps; ``None`` falls back to 10.
            do_normalize: Forwarded to the generate request.
            denoise: Forwarded to the generate request.

        Returns:
            ``(sample_rate, waveform)`` as returned by the generate call.

        Raises:
            ValueError: when the target text is empty after stripping.
            Exception: any error from the generate call is logged and re-raised.
        """
        # Normalize before logging so a None input cannot crash the log
        # calls ahead of the explicit empty-text validation below.
        raw_text = text_input or ""
        prompt_wav_path = prompt_wav_path_input or ""
        prompt_text = prompt_text_input or ""

        text_suffix = '...' if len(raw_text) > 60 else ''
        if prompt_text:
            prompt_suffix = '...' if len(prompt_text) > 30 else ''
            prompt_preview = f"'{prompt_text[:30]}{prompt_suffix}'"
        else:
            prompt_preview = "无"

        logger.info("🎤 开始TTS音频生成...")
        logger.info(f"📝 输入文本: '{raw_text[:60]}{text_suffix}'")
        logger.info(f"🎵 参考音频: {prompt_wav_path_input or '无'}")
        logger.info(f"📄 参考文本: {prompt_preview}")
        logger.info(f"⚙️ CFG值: {cfg_value_input}, 推理步数: {inference_timesteps_input}")
        logger.info(f"🔧 文本正规化: {do_normalize}, 音频降噪: {denoise}")

        try:
            full_start_time = time.time()

            text = raw_text.strip()
            if len(text) == 0:
                logger.error("❌ 输入文本为空")
                raise ValueError("Please input text to synthesize.")

            cfg_value = cfg_value_input if cfg_value_input is not None else 2.0
            inference_timesteps = inference_timesteps_input if inference_timesteps_input is not None else 10

            logger.info("🚀 调用Ray Serve TTS生成引擎...")
            generate_start = time.time()
            sr, wav_np = self._call_ray_serve_generate(
                text=text,
                prompt_wav_path=prompt_wav_path,
                prompt_text=prompt_text,
                cfg_value=cfg_value,
                inference_timesteps=inference_timesteps,
                do_normalize=do_normalize,
                denoise=denoise,
            )
            generate_time = time.time() - generate_start
            full_time = time.time() - full_start_time

            logger.info(f"✅ TTS生成完成,采样率: {sr}, 音频长度: {len(wav_np) if hasattr(wav_np, '__len__') else 'unknown'}")
            logger.info(f"🏁 完整TTS流程耗时: {full_time:.3f}秒 (生成={generate_time:.3f}s)")
            return (sr, wav_np)

        except Exception as e:
            logger.error(f"❌ TTS音频生成失败: {e}")
            logger.error(f"错误详情: {traceback.format_exc()}")
            raise
349
+
350
+
351
+ # ---------- UI Builders ----------
352
+
353
def create_demo_interface(client: RayServeVoxCPMClient):
    """Build the Gradio Blocks UI backed by a Ray Serve VoxCPM client.

    Args:
        client: Client whose ``generate_tts_audio`` handles the "Generate
            Speech" button and whose ``prompt_wav_recognition`` fills the
            prompt text whenever the prompt audio changes.

    Returns:
        The constructed ``gr.Blocks`` interface (not yet launched).
    """
    logger.info("🎨 开始创建Gradio界面...")

    # Registering the assets directory is best effort: a failure is logged
    # and the UI is still built.
    try:
        assets_path = Path.cwd().absolute()/"assets"
        logger.info(f"📁 设置静态资源路径: {assets_path}")
        gr.set_static_paths(paths=[assets_path])
        logger.info("✅ 静态资源路径设置完成")
    except Exception as e:
        logger.warning(f"⚠️ 静态资源路径设置失败: {e}")
        logger.warning("继续创建界面...")

    with gr.Blocks(
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="gray",
            neutral_hue="slate",
            font=[gr.themes.GoogleFont("Inter"), "Arial", "sans-serif"]
        ),
        css="""
        .logo-container {
        text-align: center;
        margin: 0.5rem 0 1rem 0;
        }
        .logo-container img {
        height: 80px;
        width: auto;
        max-width: 200px;
        display: inline-block;
        }
        /* Bold labels for specific checkboxes */
        #chk_denoise label,
        #chk_denoise span,
        #chk_normalize label,
        #chk_normalize span {
        font-weight: 600;
        }
        """
    ) as interface:
        gr.HTML('<div class="logo-container"><img src="/gradio_api/file=assets/voxcpm-logo.png" alt="VoxCPM Logo"></div>')

        # Quick Start
        with gr.Accordion("📋 Quick Start Guide | 快速入门", open=False):
            gr.Markdown("""
            ### How to Use |使用说明
            1. **(Optional) Provide a Voice Prompt** - Upload or record an audio clip to provide the desired voice characteristics for synthesis.
            **(可选)提供参考声音** - 上传或录制一段音频,为声音合成提供音色、语调和情感等个性化特征
            2. **(Optional) Enter prompt text** - If you provided a voice prompt, enter the corresponding transcript here (auto-recognition available).
            **(可选项)输入参考文本** - 如果提供了参考语音,请输入其对应的文本内容(支持自动识别)。
            3. **Enter target text** - Type the text you want the model to speak.
            **输入目标文本** - 输入您希望模型朗读的文字内容。
            4. **Generate Speech** - Click the "Generate" button to create your audio.
            **生成语音** - 点击"生成"按钮,即可为您创造出音频。
            """)

        # Pro Tips
        with gr.Accordion("💡 Pro Tips |使用建议", open=False):
            gr.Markdown("""
            ### Prompt Speech Enhancement|参考语音降噪
            - **Enable** to remove background noise for a clean, studio-like voice, with an external ZipEnhancer component.
            **启用**:通过 ZipEnhancer 组件消除背景噪音,获得更好的音质。
            - **Disable** to preserve the original audio's background atmosphere.
            **禁用**:保留原始音频的背景环境声,如果想复刻相应声学环境。

            ### Text Normalization|文本正则化
            - **Enable** to process general text with an external WeTextProcessing component.
            **启用**:使用 WeTextProcessing 组件,可处理常见文本。
            - **Disable** to use VoxCPM's native text understanding ability. For example, it supports phonemes input ({HH AH0 L OW1}), try it!
            **禁用**:将使用 VoxCPM 内置的文本理解能力。如,支持音素输入(如 {da4}{jia1}好)和公式符号合成,尝试一下!

            ### CFG Value|CFG 值
            - **Lower CFG** if the voice prompt sounds strained or expressive.
            **调低**:如果提示语音听起来不自然或过于夸张。
            - **Higher CFG** for better adherence to the prompt speech style or input text.
            **调高**:为更好地贴合提示音频的风格或输入文本。

            ### Inference Timesteps|推理时间步
            - **Lower** for faster synthesis speed.
            **调低**:合成速度更快。
            - **Higher** for better synthesis quality.
            **调高**:合成质量更佳。

            ### Long Text (e.g., >5 min speech)|长文本 (如 >5分钟的合成语音)
            While VoxCPM can handle long texts directly, we recommend using empty lines to break very long content into paragraphs; the model will then synthesize each paragraph individually.
            虽然 VoxCPM 支持直接生成长文本,但如果目标文本过长,我们建议使用换行符将内容分段;模型将对每个段落分别合成。
            """)

        with gr.Row():
            with gr.Column():
                prompt_wav = gr.Audio(
                    sources=["upload", 'microphone'],
                    type="filepath",
                    label="Prompt Speech",
                    value="examples/example.wav"
                )
                DoDenoisePromptAudio = gr.Checkbox(
                    value=False,
                    label="Prompt Speech Enhancement",
                    elem_id="chk_denoise",
                    info="We use ZipEnhancer model to denoise the prompt audio."
                )
                with gr.Row():
                    prompt_text = gr.Textbox(
                        value="Just by listening a few minutes a day, you'll be able to eliminate negative thoughts by conditioning your mind to be more positive.",
                        label="Prompt Text",
                        placeholder="Please enter the prompt text. Automatic recognition is supported, and you can correct the results yourself..."
                    )
                run_btn = gr.Button("Generate Speech", variant="primary")

            with gr.Column():
                cfg_value = gr.Slider(
                    minimum=1.0,
                    maximum=3.0,
                    value=2.0,
                    step=0.1,
                    label="CFG Value (Guidance Scale)",
                    info="Higher values increase adherence to prompt, lower values allow more creativity"
                )
                inference_timesteps = gr.Slider(
                    minimum=4,
                    maximum=30,
                    value=10,
                    step=1,
                    label="Inference Timesteps",
                    info="Number of inference timesteps for generation (higher values may improve quality but slower)"
                )
                with gr.Row():
                    text = gr.Textbox(
                        value="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech.",
                        label="Target Text",
                        info="Default processing splits text on \\n into paragraphs; each is synthesized as a chunk and then concatenated into the final audio."
                    )
                with gr.Row():
                    DoNormalizeText = gr.Checkbox(
                        value=False,
                        label="Text Normalization",
                        elem_id="chk_normalize",
                        # Library name spelled correctly (was "WeTextPorcessing").
                        info="We use WeTextProcessing library to normalize the input text."
                    )
                audio_output = gr.Audio(label="Output Audio")

        # Wire events. The input order must match the positional parameters of
        # generate_tts_audio: text, prompt wav, prompt text, cfg, timesteps,
        # do_normalize, denoise.
        run_btn.click(
            fn=client.generate_tts_audio,
            inputs=[text, prompt_wav, prompt_text, cfg_value, inference_timesteps, DoNormalizeText, DoDenoisePromptAudio],
            outputs=[audio_output],
            show_progress=True,
            api_name="generate",
            concurrency_limit=None,
        )
        # Re-run ASR whenever the prompt audio changes and put the result in
        # the prompt text box.
        prompt_wav.change(fn=client.prompt_wav_recognition, inputs=[prompt_wav], outputs=[prompt_text])

        logger.info("🔗 事件绑定完成")

    logger.info("✅ Gradio界面构建完成")
    return interface
510
+
511
+
512
def run_demo():
    """Create the API client, build the UI, and serve the Gradio app.

    The listening port is read from the ``PORT`` environment variable
    (default 7860). Any exception during setup or launch is logged with its
    traceback and the process exits with status 1.
    """
    logger.info("🚀 开始启动VoxCPM演示应用...")

    try:
        # Step 1: API client (its constructor probes the service).
        logger.info("📡 创建Ray Serve API客户端...")
        tts_client = RayServeVoxCPMClient()
        logger.info("✅ Ray Serve API客户端创建成功")

        # Step 2: UI.
        logger.info("🎨 创建Gradio界面...")
        demo = create_demo_interface(tts_client)
        logger.info("✅ Gradio界面创建成功")

        # Step 3: port configuration.
        listen_port = int(os.environ.get('PORT', 7860))
        logger.info(f"🌐 准备在端口 {listen_port} 启动服务...")

        # Step 4: serve on all interfaces.
        logger.info("🚀 启动Gradio应用...")
        demo.launch(
            server_name="0.0.0.0",
            server_port=listen_port,
            show_error=True,
        )
        # NOTE(review): launch() presumably blocks until the server stops, so
        # this line may only be reached on shutdown — confirm.
        logger.info("✅ 应用启动成功!")

    except Exception as exc:
        logger.error(f"❌ 应用启动失败: {exc}")
        logger.error(f"错误详情: {traceback.format_exc()}")
        sys.exit(1)
544
+
545
+
546
if __name__ == "__main__":
    # Script entry point: run the demo, log how the process ended, and exit
    # with status 1 on an unexpected error.
    try:
        logger.info("🎬 开始执行主程序...")
        run_demo()
    except KeyboardInterrupt:
        # Ctrl+C is an expected way to stop; log it and fall through to exit.
        logger.info("⏹️ 收到中断信号,正在退出...")
    except Exception as exc:
        logger.error(f"💥 主程序异常退出: {exc}")
        logger.error(f"错误详情: {traceback.format_exc()}")
        sys.exit(1)
    finally:
        logger.info("🔚 程序结束")
assets/voxcpm-logo.png ADDED

Git LFS Details

  • SHA256: 7b90525ab45f7b303eba72c7b0375ae25d81c7233e890ef6358feacc479239f1
  • Pointer size: 130 Bytes
  • Size of remote file: 24.5 kB
examples/example.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:009638e7474ac4eb2ca5b23d28d9114c33377eb5c91e8d6f7000a0c36d6eaa8e
3
+ size 1439096
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Core dependencies
2
+ gradio>=4.0.0
3
+ requests>=2.25.0
4
+ numpy>=1.21.0
5
+ soundfile>=0.12.1