mrfakename committed on
Commit
e35df77
1 Parent(s): bc38247

Sync from GitHub repo

Browse files

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the GitHub repo there, not to this Space.

Files changed (1) hide show
  1. app.py +51 -49
app.py CHANGED
@@ -3,6 +3,7 @@
3
 
4
  import re
5
  import tempfile
 
6
 
7
  import click
8
  import gradio as gr
@@ -116,7 +117,7 @@ def infer(
116
  spectrogram_path = tmp_spectrogram.name
117
  save_spectrogram(combined_spectrogram, spectrogram_path)
118
 
119
- return (final_sample_rate, final_wave), spectrogram_path
120
 
121
 
122
  with gr.Blocks() as app_credits:
@@ -172,7 +173,7 @@ with gr.Blocks() as app_tts:
172
  cross_fade_duration_slider,
173
  speed_slider,
174
  ):
175
- return infer(
176
  ref_audio_input,
177
  ref_text_input,
178
  gen_text_input,
@@ -181,6 +182,7 @@ with gr.Blocks() as app_tts:
181
  cross_fade_duration_slider,
182
  speed_slider,
183
  )
 
184
 
185
  generate_btn.click(
186
  basic_tts,
@@ -192,7 +194,7 @@ with gr.Blocks() as app_tts:
192
  cross_fade_duration_slider,
193
  speed_slider,
194
  ],
195
- outputs=[audio_output, spectrogram_output],
196
  )
197
 
198
 
@@ -262,26 +264,26 @@ with gr.Blocks() as app_multistyle:
262
  with gr.Row():
263
  with gr.Column():
264
  regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
265
- regular_insert = gr.Button("Insert", variant="secondary")
266
  regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
267
  regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
268
 
269
- # Additional speech types (up to 99 more)
270
  max_speech_types = 100
271
- speech_type_rows = []
272
- speech_type_names = [regular_name]
273
- speech_type_audios = []
274
- speech_type_ref_texts = []
275
- speech_type_delete_btns = []
276
- speech_type_insert_btns = []
277
- speech_type_insert_btns.append(regular_insert)
278
-
279
  for i in range(max_speech_types - 1):
280
  with gr.Row(visible=False) as row:
281
  with gr.Column():
282
  name_input = gr.Textbox(label="Speech Type Name")
283
- delete_btn = gr.Button("Delete", variant="secondary")
284
- insert_btn = gr.Button("Insert", variant="secondary")
285
  audio_input = gr.Audio(label="Reference Audio", type="filepath")
286
  ref_text_input = gr.Textbox(label="Reference Text", lines=2)
287
  speech_type_rows.append(row)
@@ -295,22 +297,22 @@ with gr.Blocks() as app_multistyle:
295
  add_speech_type_btn = gr.Button("Add Speech Type")
296
 
297
  # Keep track of current number of speech types
298
- speech_type_count = gr.State(value=0)
299
 
300
  # Function to add a speech type
301
  def add_speech_type_fn(speech_type_count):
302
- if speech_type_count < max_speech_types - 1:
303
  speech_type_count += 1
304
  # Prepare updates for the rows
305
  row_updates = []
306
- for i in range(max_speech_types - 1):
307
  if i < speech_type_count:
308
  row_updates.append(gr.update(visible=True))
309
  else:
310
  row_updates.append(gr.update())
311
  else:
312
  # Optionally, show a warning
313
- row_updates = [gr.update() for _ in range(max_speech_types - 1)]
314
  return [speech_type_count] + row_updates
315
 
316
  add_speech_type_btn.click(
@@ -323,13 +325,13 @@ with gr.Blocks() as app_multistyle:
323
  # Prepare updates
324
  row_updates = []
325
 
326
- for i in range(max_speech_types - 1):
327
  if i == index:
328
  row_updates.append(gr.update(visible=False))
329
  else:
330
  row_updates.append(gr.update())
331
 
332
- speech_type_count = max(0, speech_type_count - 1)
333
 
334
  return [speech_type_count] + row_updates
335
 
@@ -367,7 +369,7 @@ with gr.Blocks() as app_multistyle:
367
  with gr.Accordion("Advanced Settings", open=False):
368
  remove_silence_multistyle = gr.Checkbox(
369
  label="Remove Silences",
370
- value=False,
371
  )
372
 
373
  # Generate button
@@ -378,25 +380,25 @@ with gr.Blocks() as app_multistyle:
378
 
379
  @gpu_decorator
380
  def generate_multistyle_speech(
381
- regular_audio,
382
- regular_ref_text,
383
  gen_text,
384
  *args,
385
  ):
386
- num_additional_speech_types = max_speech_types - 1
387
- speech_type_names_list = args[:num_additional_speech_types]
388
- speech_type_audios_list = args[num_additional_speech_types : 2 * num_additional_speech_types]
389
- speech_type_ref_texts_list = args[2 * num_additional_speech_types : 3 * num_additional_speech_types]
390
- remove_silence = args[3 * num_additional_speech_types + 1]
391
-
392
  # Collect the speech types and their audios into a dict
393
- speech_types = {"Regular": {"audio": regular_audio, "ref_text": regular_ref_text}}
394
 
 
395
  for name_input, audio_input, ref_text_input in zip(
396
  speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
397
  ):
398
  if name_input and audio_input:
399
  speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text_input}
 
 
 
400
 
401
  # Parse the gen_text into segments
402
  segments = parse_speechtypes_text(gen_text)
@@ -419,26 +421,27 @@ with gr.Blocks() as app_multistyle:
419
  ref_text = speech_types[current_style].get("ref_text", "")
420
 
421
  # Generate speech for this segment
422
- audio, _ = infer(
423
  ref_audio, ref_text, text, tts_model_choice, remove_silence, 0, show_info=print
424
  ) # show_info=print no pull to top when generating
425
- sr, audio_data = audio
426
 
427
  generated_audio_segments.append(audio_data)
 
428
 
429
  # Concatenate all audio segments
430
  if generated_audio_segments:
431
  final_audio_data = np.concatenate(generated_audio_segments)
432
- return (sr, final_audio_data)
 
 
433
  else:
434
  gr.Warning("No audio generated.")
435
- return None
436
 
437
  generate_multistyle_btn.click(
438
  generate_multistyle_speech,
439
  inputs=[
440
- regular_audio,
441
- regular_ref_text,
442
  gen_text_input_multistyle,
443
  ]
444
  + speech_type_names
@@ -447,13 +450,12 @@ with gr.Blocks() as app_multistyle:
447
  + [
448
  remove_silence_multistyle,
449
  ],
450
- outputs=audio_output_multistyle,
451
  )
452
 
453
  # Validation function to disable Generate button if speech types are missing
454
  def validate_speech_types(gen_text, regular_name, *args):
455
- num_additional_speech_types = max_speech_types - 1
456
- speech_type_names_list = args[:num_additional_speech_types]
457
 
458
  # Collect the speech types names
459
  speech_types_available = set()
@@ -561,7 +563,7 @@ Have a conversation with an AI using your reference voice!
561
  label="Type your message",
562
  lines=1,
563
  )
564
- send_btn_chat = gr.Button("Send")
565
  clear_btn_chat = gr.Button("Clear Conversation")
566
 
567
  conversation_state = gr.State(
@@ -607,7 +609,7 @@ Have a conversation with an AI using your reference voice!
607
  if not last_ai_response:
608
  return None
609
 
610
- audio_result, _ = infer(
611
  ref_audio,
612
  ref_text,
613
  last_ai_response,
@@ -617,7 +619,7 @@ Have a conversation with an AI using your reference voice!
617
  speed=1.0,
618
  show_info=print, # show_info=print no pull to top when generating
619
  )
620
- return audio_result
621
 
622
  def clear_conversation():
623
  """Reset the conversation"""
@@ -641,7 +643,7 @@ Have a conversation with an AI using your reference voice!
641
  ).then(
642
  generate_audio_response,
643
  inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
644
- outputs=[audio_output_chat],
645
  ).then(
646
  lambda: None,
647
  None,
@@ -656,7 +658,7 @@ Have a conversation with an AI using your reference voice!
656
  ).then(
657
  generate_audio_response,
658
  inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
659
- outputs=[audio_output_chat],
660
  ).then(
661
  lambda: None,
662
  None,
@@ -671,7 +673,7 @@ Have a conversation with an AI using your reference voice!
671
  ).then(
672
  generate_audio_response,
673
  inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
674
- outputs=[audio_output_chat],
675
  ).then(
676
  lambda: None,
677
  None,
@@ -702,9 +704,9 @@ This is a local web UI for F5 TTS with advanced batch processing support. This a
702
  * [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
703
  * [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
704
 
705
- The checkpoints support English and Chinese.
706
 
707
- If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.
708
 
709
  **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
710
  """
@@ -729,7 +731,7 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
729
 
730
  gr.TabbedInterface(
731
  [app_tts, app_multistyle, app_chat, app_credits],
732
- ["TTS", "Multi-Speech", "Voice-Chat", "Credits"],
733
  )
734
 
735
 
 
3
 
4
  import re
5
  import tempfile
6
+ from collections import OrderedDict
7
 
8
  import click
9
  import gradio as gr
 
117
  spectrogram_path = tmp_spectrogram.name
118
  save_spectrogram(combined_spectrogram, spectrogram_path)
119
 
120
+ return (final_sample_rate, final_wave), spectrogram_path, ref_text
121
 
122
 
123
  with gr.Blocks() as app_credits:
 
173
  cross_fade_duration_slider,
174
  speed_slider,
175
  ):
176
+ audio_out, spectrogram_path, ref_text_out = infer(
177
  ref_audio_input,
178
  ref_text_input,
179
  gen_text_input,
 
182
  cross_fade_duration_slider,
183
  speed_slider,
184
  )
185
+ return audio_out, spectrogram_path, gr.update(value=ref_text_out)
186
 
187
  generate_btn.click(
188
  basic_tts,
 
194
  cross_fade_duration_slider,
195
  speed_slider,
196
  ],
197
+ outputs=[audio_output, spectrogram_output, ref_text_input],
198
  )
199
 
200
 
 
264
  with gr.Row():
265
  with gr.Column():
266
  regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
267
+ regular_insert = gr.Button("Insert Label", variant="secondary")
268
  regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
269
  regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
270
 
271
+ # Regular speech type (max 100)
272
  max_speech_types = 100
273
+ speech_type_rows = [] # 99
274
+ speech_type_names = [regular_name] # 100
275
+ speech_type_audios = [regular_audio] # 100
276
+ speech_type_ref_texts = [regular_ref_text] # 100
277
+ speech_type_delete_btns = [] # 99
278
+ speech_type_insert_btns = [regular_insert] # 100
279
+
280
+ # Additional speech types (99 more)
281
  for i in range(max_speech_types - 1):
282
  with gr.Row(visible=False) as row:
283
  with gr.Column():
284
  name_input = gr.Textbox(label="Speech Type Name")
285
+ delete_btn = gr.Button("Delete Type", variant="secondary")
286
+ insert_btn = gr.Button("Insert Label", variant="secondary")
287
  audio_input = gr.Audio(label="Reference Audio", type="filepath")
288
  ref_text_input = gr.Textbox(label="Reference Text", lines=2)
289
  speech_type_rows.append(row)
 
297
  add_speech_type_btn = gr.Button("Add Speech Type")
298
 
299
  # Keep track of current number of speech types
300
+ speech_type_count = gr.State(value=1)
301
 
302
  # Function to add a speech type
303
  def add_speech_type_fn(speech_type_count):
304
+ if speech_type_count < max_speech_types:
305
  speech_type_count += 1
306
  # Prepare updates for the rows
307
  row_updates = []
308
+ for i in range(1, max_speech_types):
309
  if i < speech_type_count:
310
  row_updates.append(gr.update(visible=True))
311
  else:
312
  row_updates.append(gr.update())
313
  else:
314
  # Optionally, show a warning
315
+ row_updates = [gr.update() for _ in range(1, max_speech_types)]
316
  return [speech_type_count] + row_updates
317
 
318
  add_speech_type_btn.click(
 
325
  # Prepare updates
326
  row_updates = []
327
 
328
+ for i in range(1, max_speech_types):
329
  if i == index:
330
  row_updates.append(gr.update(visible=False))
331
  else:
332
  row_updates.append(gr.update())
333
 
334
+ speech_type_count = max(1, speech_type_count)
335
 
336
  return [speech_type_count] + row_updates
337
 
 
369
  with gr.Accordion("Advanced Settings", open=False):
370
  remove_silence_multistyle = gr.Checkbox(
371
  label="Remove Silences",
372
+ value=True,
373
  )
374
 
375
  # Generate button
 
380
 
381
  @gpu_decorator
382
  def generate_multistyle_speech(
 
 
383
  gen_text,
384
  *args,
385
  ):
386
+ speech_type_names_list = args[:max_speech_types]
387
+ speech_type_audios_list = args[max_speech_types : 2 * max_speech_types]
388
+ speech_type_ref_texts_list = args[2 * max_speech_types : 3 * max_speech_types]
389
+ remove_silence = args[3 * max_speech_types]
 
 
390
  # Collect the speech types and their audios into a dict
391
+ speech_types = OrderedDict()
392
 
393
+ ref_text_idx = 0
394
  for name_input, audio_input, ref_text_input in zip(
395
  speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
396
  ):
397
  if name_input and audio_input:
398
  speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text_input}
399
+ else:
400
+ speech_types[f"@{ref_text_idx}@"] = {"audio": "", "ref_text": ""}
401
+ ref_text_idx += 1
402
 
403
  # Parse the gen_text into segments
404
  segments = parse_speechtypes_text(gen_text)
 
421
  ref_text = speech_types[current_style].get("ref_text", "")
422
 
423
  # Generate speech for this segment
424
+ audio_out, _, ref_text_out = infer(
425
  ref_audio, ref_text, text, tts_model_choice, remove_silence, 0, show_info=print
426
  ) # show_info=print no pull to top when generating
427
+ sr, audio_data = audio_out
428
 
429
  generated_audio_segments.append(audio_data)
430
+ speech_types[current_style]["ref_text"] = ref_text_out
431
 
432
  # Concatenate all audio segments
433
  if generated_audio_segments:
434
  final_audio_data = np.concatenate(generated_audio_segments)
435
+ return [(sr, final_audio_data)] + [
436
+ gr.update(value=speech_types[style]["ref_text"]) for style in speech_types
437
+ ]
438
  else:
439
  gr.Warning("No audio generated.")
440
+ return [None] + [gr.update(value=speech_types[style]["ref_text"]) for style in speech_types]
441
 
442
  generate_multistyle_btn.click(
443
  generate_multistyle_speech,
444
  inputs=[
 
 
445
  gen_text_input_multistyle,
446
  ]
447
  + speech_type_names
 
450
  + [
451
  remove_silence_multistyle,
452
  ],
453
+ outputs=[audio_output_multistyle] + speech_type_ref_texts,
454
  )
455
 
456
  # Validation function to disable Generate button if speech types are missing
457
  def validate_speech_types(gen_text, regular_name, *args):
458
+ speech_type_names_list = args[:max_speech_types]
 
459
 
460
  # Collect the speech types names
461
  speech_types_available = set()
 
563
  label="Type your message",
564
  lines=1,
565
  )
566
+ send_btn_chat = gr.Button("Send Message")
567
  clear_btn_chat = gr.Button("Clear Conversation")
568
 
569
  conversation_state = gr.State(
 
609
  if not last_ai_response:
610
  return None
611
 
612
+ audio_result, _, ref_text_out = infer(
613
  ref_audio,
614
  ref_text,
615
  last_ai_response,
 
619
  speed=1.0,
620
  show_info=print, # show_info=print no pull to top when generating
621
  )
622
+ return audio_result, gr.update(value=ref_text_out)
623
 
624
  def clear_conversation():
625
  """Reset the conversation"""
 
643
  ).then(
644
  generate_audio_response,
645
  inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
646
+ outputs=[audio_output_chat, ref_text_chat],
647
  ).then(
648
  lambda: None,
649
  None,
 
658
  ).then(
659
  generate_audio_response,
660
  inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
661
+ outputs=[audio_output_chat, ref_text_chat],
662
  ).then(
663
  lambda: None,
664
  None,
 
673
  ).then(
674
  generate_audio_response,
675
  inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, remove_silence_chat],
676
+ outputs=[audio_output_chat, ref_text_chat],
677
  ).then(
678
  lambda: None,
679
  None,
 
704
  * [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
705
  * [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)
706
 
707
+ The checkpoints currently support English and Chinese.
708
 
709
+ If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s with ✂ in the bottom right corner (otherwise might have non-optimal auto-trimmed result).
710
 
711
  **NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
712
  """
 
731
 
732
  gr.TabbedInterface(
733
  [app_tts, app_multistyle, app_chat, app_credits],
734
+ ["Basic-TTS", "Multi-Speech", "Voice-Chat", "Credits"],
735
  )
736
 
737