mrfakename committed
Commit: 0978fba
Parent(s): 0559e57

Sync from GitHub repo
This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.
app.py
CHANGED
@@ -79,8 +79,10 @@ def generate_response(messages, model, tokenizer):
 
 
 @gpu_decorator
-def infer(
-
+def infer(
+    ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
+):
+    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
 
     if model == "F5-TTS":
         ema_model = F5TTS_ema_model
@@ -94,7 +96,7 @@ def infer(ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_
         ema_model,
         cross_fade_duration=cross_fade_duration,
         speed=speed,
-        show_info=
+        show_info=show_info,
         progress=gr.Progress(),
     )
 
@@ -183,24 +185,24 @@ def parse_speechtypes_text(gen_text):
 
     segments = []
 
-
+    current_style = "Regular"
 
     for i in range(len(tokens)):
         if i % 2 == 0:
             # This is text
             text = tokens[i].strip()
             if text:
-                segments.append({"
+                segments.append({"style": current_style, "text": text})
         else:
-            # This is
-
-
+            # This is style
+            style = tokens[i].strip()
+            current_style = style
 
     return segments
 
 
 with gr.Blocks() as app_multistyle:
-    # New section for
+    # New section for multistyle generation
     gr.Markdown(
         """
     # Multiple Speech-Type Generation
@@ -313,29 +315,29 @@ with gr.Blocks() as app_multistyle:
     delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
 
     # Text input for the prompt
-
+    gen_text_input_multistyle = gr.Textbox(
         label="Text to Generate",
         lines=10,
         placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
     )
 
     # Model choice
-
+    model_choice_multistyle = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
 
     with gr.Accordion("Advanced Settings", open=False):
-
+        remove_silence_multistyle = gr.Checkbox(
             label="Remove Silences",
             value=False,
         )
 
     # Generate button
-
+    generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
 
     # Output audio
-
+    audio_output_multistyle = gr.Audio(label="Synthesized Audio")
 
     @gpu_decorator
-    def
+    def generate_multistyle_speech(
         regular_audio,
         regular_ref_text,
         gen_text,
@@ -362,23 +364,23 @@ with gr.Blocks() as app_multistyle:
 
         # For each segment, generate speech
        generated_audio_segments = []
-
+        current_style = "Regular"
 
         for segment in segments:
-
+            style = segment["style"]
             text = segment["text"]
 
-            if
-
+            if style in speech_types:
+                current_style = style
             else:
-                # If
-
+                # If style not available, default to Regular
+                current_style = "Regular"
 
-            ref_audio = speech_types[
-            ref_text = speech_types[
+            ref_audio = speech_types[current_style]["audio"]
+            ref_text = speech_types[current_style].get("ref_text", "")
 
             # Generate speech for this segment
-            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, 0)
+            audio, _ = infer(ref_audio, ref_text, text, model_choice, remove_silence, 0, show_info=None)
             sr, audio_data = audio
 
             generated_audio_segments.append(audio_data)
@@ -391,21 +393,21 @@ with gr.Blocks() as app_multistyle:
             gr.Warning("No audio generated.")
             return None
 
-
-
+    generate_multistyle_btn.click(
+        generate_multistyle_speech,
         inputs=[
             regular_audio,
             regular_ref_text,
-
+            gen_text_input_multistyle,
         ]
         + speech_type_names
         + speech_type_audios
         + speech_type_ref_texts
         + [
-
-
+            model_choice_multistyle,
+            remove_silence_multistyle,
         ],
-        outputs=
+        outputs=audio_output_multistyle,
     )
 
     # Validation function to disable Generate button if speech types are missing
@@ -423,7 +425,7 @@ with gr.Blocks() as app_multistyle:
 
         # Parse the gen_text to get the speech types used
         segments = parse_speechtypes_text(gen_text)
-        speech_types_in_text = set(segment["
+        speech_types_in_text = set(segment["style"] for segment in segments)
 
         # Check if all speech types in text are available
         missing_speech_types = speech_types_in_text - speech_types_available
@@ -435,10 +437,10 @@ with gr.Blocks() as app_multistyle:
         # Enable the generate button
         return gr.update(interactive=True)
 
-
+    gen_text_input_multistyle.change(
        validate_speech_types,
-        inputs=[
-        outputs=
+        inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
+        outputs=generate_multistyle_btn,
    )
 
 
@@ -576,6 +578,7 @@ Have a conversation with an AI using your reference voice!
             remove_silence,
             cross_fade_duration=0.15,
             speed=1.0,
+            show_info=None,
         )
         return audio_result
 
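For quick reference, here is a small, self-contained sketch of the behaviour this commit settles on: parse_speechtypes_text now emits segments keyed by "style", and the multi-style loop picks each segment's reference audio and text from speech_types before calling infer(..., show_info=None). The tokenizing regex, the sample script, the placeholder file names, and the final print (standing in for the real infer call, which needs the loaded TTS models) are illustrative assumptions, not code from app.py; only the segment format and the per-segment lookup mirror the diff above.

# Illustrative sketch (not part of the commit): exercises the "style"-keyed
# segment format and the per-segment reference lookup shown in the diff above.
import re


def parse_speechtypes_text(gen_text):
    # Assumption: the app splits on {Style} tags so that even indices are text
    # and odd indices are style names; the loop body mirrors the committed code.
    tokens = re.split(r"\{(.*?)\}", gen_text)
    segments = []
    current_style = "Regular"
    for i in range(len(tokens)):
        if i % 2 == 0:
            text = tokens[i].strip()
            if text:
                segments.append({"style": current_style, "text": text})
        else:
            current_style = tokens[i].strip()
    return segments


# Hypothetical speech-type table; the "audio"/"ref_text" keys match the lookup
# done in generate_multistyle_speech.
speech_types = {
    "Regular": {"audio": "regular_ref.wav", "ref_text": "A calm reference clip."},
    "Whisper": {"audio": "whisper_ref.wav"},  # ref_text intentionally omitted
}

script = "{Regular} Hello there. {Whisper} Don't tell anyone. {Shouting} Bye!"

for segment in parse_speechtypes_text(script):
    # Unknown styles fall back to "Regular", as in the committed loop.
    style = segment["style"] if segment["style"] in speech_types else "Regular"
    ref_audio = speech_types[style]["audio"]
    ref_text = speech_types[style].get("ref_text", "")
    # app.py calls infer(ref_audio, ref_text, text, model_choice, remove_silence, 0, show_info=None) here.
    print(f'[{style}] {segment["text"]!r} -> ref_audio={ref_audio}, ref_text={ref_text!r}')

Passing show_info=None on the multi-style and chat paths, as the diff does, presumably suppresses the per-call Gradio pop-ups that the gr.Info default would raise for every block of a long script.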