mrfakename committed
Commit bc38247
Parent: cbc2a64

Sync from GitHub repo


This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the GitHub repo rather than to this Space.

Files changed (2):
  1. app.py +46 -19
  2. src/f5_tts/infer/utils_infer.py +13 -12
app.py CHANGED
@@ -51,6 +51,8 @@ E2TTS_ema_model = load_model(
     UNetT, E2TTS_model_cfg, str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
 )
 
+DEFAULT_TTS_MODEL = "F5-TTS"
+tts_model_choice = DEFAULT_TTS_MODEL
 chat_model_state = None
 chat_tokenizer_state = None
 
@@ -129,7 +131,6 @@ with gr.Blocks() as app_tts:
     gr.Markdown("# Batched TTS")
     ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
     gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
-    model_choice = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
     generate_btn = gr.Button("Synthesize", variant="primary")
     with gr.Accordion("Advanced Settings", open=False):
         ref_text_input = gr.Textbox(
@@ -162,13 +163,31 @@ with gr.Blocks() as app_tts:
     audio_output = gr.Audio(label="Synthesized Audio")
     spectrogram_output = gr.Image(label="Spectrogram")
 
+    @gpu_decorator
+    def basic_tts(
+        ref_audio_input,
+        ref_text_input,
+        gen_text_input,
+        remove_silence,
+        cross_fade_duration_slider,
+        speed_slider,
+    ):
+        return infer(
+            ref_audio_input,
+            ref_text_input,
+            gen_text_input,
+            tts_model_choice,
+            remove_silence,
+            cross_fade_duration_slider,
+            speed_slider,
+        )
+
     generate_btn.click(
-        infer,
+        basic_tts,
         inputs=[
             ref_audio_input,
             ref_text_input,
             gen_text_input,
-            model_choice,
             remove_silence,
             cross_fade_duration_slider,
             speed_slider,
@@ -345,9 +364,6 @@ with gr.Blocks() as app_multistyle:
         outputs=gen_text_input_multistyle,
     )
 
-    # Model choice
-    model_choice_multistyle = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
-
     with gr.Accordion("Advanced Settings", open=False):
         remove_silence_multistyle = gr.Checkbox(
             label="Remove Silences",
@@ -371,7 +387,6 @@ with gr.Blocks() as app_multistyle:
         speech_type_names_list = args[:num_additional_speech_types]
         speech_type_audios_list = args[num_additional_speech_types : 2 * num_additional_speech_types]
         speech_type_ref_texts_list = args[2 * num_additional_speech_types : 3 * num_additional_speech_types]
-        model_choice = args[3 * num_additional_speech_types + 1]
         remove_silence = args[3 * num_additional_speech_types + 1]
 
         # Collect the speech types and their audios into a dict
@@ -405,7 +420,7 @@ with gr.Blocks() as app_multistyle:
 
             # Generate speech for this segment
             audio, _ = infer(
-                ref_audio, ref_text, text, model_choice, remove_silence, 0, show_info=print
+                ref_audio, ref_text, text, tts_model_choice, remove_silence, 0, show_info=print
             )  # show_info=print no pull to top when generating
             sr, audio_data = audio
 
@@ -430,7 +445,6 @@ with gr.Blocks() as app_multistyle:
         + speech_type_audios
         + speech_type_ref_texts
         + [
-            model_choice_multistyle,
             remove_silence_multistyle,
         ],
         outputs=audio_output_multistyle,
@@ -518,11 +532,6 @@ Have a conversation with an AI using your reference voice!
                 ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
             with gr.Column():
                 with gr.Accordion("Advanced Settings", open=False):
-                    model_choice_chat = gr.Radio(
-                        choices=["F5-TTS", "E2-TTS"],
-                        label="TTS Model",
-                        value="F5-TTS",
-                    )
                     remove_silence_chat = gr.Checkbox(
                         label="Remove Silences",
                         value=True,
@@ -589,7 +598,7 @@ Have a conversation with an AI using your reference voice!
         return history, conv_state, ""
 
     @gpu_decorator
-    def generate_audio_response(history, ref_audio, ref_text, model, remove_silence):
+    def generate_audio_response(history, ref_audio, ref_text, remove_silence):
         """Generate TTS audio for AI response"""
         if not history or not ref_audio:
             return None
@@ -602,7 +611,7 @@ Have a conversation with an AI using your reference voice!
             ref_audio,
             ref_text,
             last_ai_response,
-            model,
+            tts_model_choice,
             remove_silence,
             cross_fade_duration=0.15,
             speed=1.0,
@@ -631,7 +640,7 @@ Have a conversation with an AI using your reference voice!
         outputs=[chatbot_interface, conversation_state],
     ).then(
         generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
         outputs=[audio_output_chat],
     ).then(
         lambda: None,
@@ -646,7 +655,7 @@ Have a conversation with an AI using your reference voice!
        outputs=[chatbot_interface, conversation_state],
     ).then(
        generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
        outputs=[audio_output_chat],
     ).then(
        lambda: None,
@@ -661,7 +670,7 @@ Have a conversation with an AI using your reference voice!
        outputs=[chatbot_interface, conversation_state],
     ).then(
        generate_audio_response,
-        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
+        inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
        outputs=[audio_output_chat],
     ).then(
        lambda: None,
@@ -700,6 +709,24 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
     **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
     """
     )
+
+    def switch_tts_model(new_choice):
+        global tts_model_choice
+        tts_model_choice = new_choice
+
+    if not USING_SPACES:
+        choose_tts_model = gr.Radio(
+            choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
+        )
+    else:
+        choose_tts_model = gr.Radio(
+            choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
+        )
+    choose_tts_model.change(
+        switch_tts_model,
+        inputs=choose_tts_model,
+    )
+
     gr.TabbedInterface(
         [app_tts, app_multistyle, app_chat, app_credits],
         ["TTS", "Multi-Speech", "Voice-Chat", "Credits"],
src/f5_tts/infer/utils_infer.py CHANGED
@@ -282,13 +282,13 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
         audio_data = audio_file.read()
         audio_hash = hashlib.md5(audio_data).hexdigest()
 
-    global _ref_audio_cache
-    if audio_hash in _ref_audio_cache:
-        # Use cached reference text
-        show_info("Using cached reference text...")
-        ref_text = _ref_audio_cache[audio_hash]
-    else:
-        if not ref_text.strip():
+    if not ref_text.strip():
+        global _ref_audio_cache
+        if audio_hash in _ref_audio_cache:
+            # Use cached asr transcription
+            show_info("Using cached reference text...")
+            ref_text = _ref_audio_cache[audio_hash]
+        else:
             global asr_pipe
             if asr_pipe is None:
                 initialize_asr_pipeline(device=device)
@@ -300,11 +300,10 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
                 generate_kwargs={"task": "transcribe"},
                 return_timestamps=False,
             )["text"].strip()
-            show_info("Finished transcription")
-        else:
-            show_info("Using custom reference text...")
-        # Cache the transcribed text
-        _ref_audio_cache[audio_hash] = ref_text
+            # Cache the transcribed text (not caching custom ref_text, enabling users to do manual tweak)
+            _ref_audio_cache[audio_hash] = ref_text
+    else:
+        show_info("Using custom reference text...")
 
     # Ensure ref_text ends with a proper sentence-ending punctuation
     if not ref_text.endswith(". ") and not ref_text.endswith("。"):
@@ -313,6 +312,8 @@ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_in
         else:
             ref_text += ". "
 
+    print("ref_text ", ref_text)
+
     return ref_audio, ref_text
 
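
The utils_infer.py change narrows the transcription cache: _ref_audio_cache is keyed by an MD5 of the raw audio bytes, now stores only ASR output, and is consulted only when no reference text was supplied, so a manually edited ref_text always wins over a stale cached transcription. A minimal sketch of the same pattern, where the get_ref_text helper and the transcribe callable are hypothetical stand-ins for preprocess_ref_audio_text and asr_pipe:

import hashlib

_ref_audio_cache = {}  # MD5 of audio bytes -> ASR transcription


def get_ref_text(audio_path, ref_text, transcribe, show_info=print):
    # Hash the audio content, not the path, so re-uploads of the
    # same clip still hit the cache.
    with open(audio_path, "rb") as audio_file:
        audio_hash = hashlib.md5(audio_file.read()).hexdigest()

    if not ref_text.strip():
        if audio_hash in _ref_audio_cache:
            show_info("Using cached reference text...")
            ref_text = _ref_audio_cache[audio_hash]
        else:
            ref_text = transcribe(audio_path)
            # Only ASR output is cached; user-supplied text is never
            # stored, so manual tweaks keep taking effect on reruns.
            _ref_audio_cache[audio_hash] = ref_text
    else:
        show_info("Using custom reference text...")
    return ref_text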