Bubarino committed on
Commit
2cfd52d
1 Parent(s): 145adb7

Delete infer_gradio.py

Browse files
Files changed (1) hide show
  1. infer_gradio.py +0 -730
infer_gradio.py DELETED
@@ -1,730 +0,0 @@
1
- # ruff: noqa: E402
2
- # Above allows ruff to ignore E402: module level import not at top of file
3
-
4
- import re
5
- import tempfile
6
-
7
- import click
8
- import gradio as gr
9
- import numpy as np
10
- import soundfile as sf
11
- import torchaudio
12
- from cached_path import cached_path
13
- from transformers import AutoModelForCausalLM, AutoTokenizer
14
-
15
- try:
16
- import spaces
17
-
18
- USING_SPACES = True
19
- except ImportError:
20
- USING_SPACES = False
21
-
22
-
23
def gpu_decorator(func):
    """Wrap ``func`` with ``spaces.GPU`` when the ``spaces`` package is available.

    When ``USING_SPACES`` is false (the ``spaces`` import failed) the function
    is returned unchanged, so decorated code also runs outside that environment.
    """
    return spaces.GPU(func) if USING_SPACES else func
28
-
29
-
30
- from f5_tts.model import DiT, UNetT
31
- from f5_tts.infer.utils_infer import (
32
- load_vocoder,
33
- load_model,
34
- preprocess_ref_audio_text,
35
- infer_process,
36
- remove_silence_for_generated_wav,
37
- save_spectrogram,
38
- )
39
-
40
# Vocoder instance shared by every synthesis call in this module.
vocoder = load_vocoder()


# load models
# F5-TTS uses the DiT backbone.
# NOTE(review): the checkpoint below is an absolute local path, not an hf:// URL
# like the E2-TTS one - confirm it exists on the machine running this app.
F5TTS_model_cfg = {"dim": 1024, "depth": 22, "heads": 16, "ff_mult": 2, "text_dim": 512, "conv_layers": 4}
F5TTS_ema_model = load_model(
    DiT,
    F5TTS_model_cfg,
    str(cached_path("/content/model_467k.safetensors")),
)

# E2-TTS uses the UNetT backbone.
E2TTS_model_cfg = {"dim": 1024, "depth": 24, "heads": 16, "ff_mult": 4}
E2TTS_ema_model = load_model(
    UNetT,
    E2TTS_model_cfg,
    str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors")),
)

# Chat model and tokenizer are filled in later (see load_chat_model / the Spaces branch).
chat_model_state = None
chat_tokenizer_state = None
56
-
57
-
58
@gpu_decorator
def generate_response(messages, model, tokenizer):
    """Return the chat model's reply to ``messages`` as a plain string.

    The messages are rendered with the tokenizer's chat template (generation
    prompt appended), passed to ``model.generate`` and only the newly generated
    tokens of the first sequence are decoded.
    """
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer([prompt], return_tensors="pt").to(model.device)
    output_batch = model.generate(**encoded, max_new_tokens=512, temperature=0.7, top_p=0.95)

    # Each output sequence starts with its prompt tokens; slice them off so that
    # only the continuation is decoded.
    continuations = []
    for prompt_ids, full_ids in zip(encoded.input_ids, output_batch):
        continuations.append(full_ids[len(prompt_ids) :])

    decoded = tokenizer.batch_decode(continuations, skip_special_tokens=True)
    return decoded[0]
79
-
80
-
81
@gpu_decorator
def infer(
    ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
):
    """Synthesize ``gen_text`` using a reference clip and one of the loaded models.

    Args:
        ref_audio_orig: Reference audio, forwarded to ``preprocess_ref_audio_text``
            (the UI components in this file pass a file path).
        ref_text: Reference transcript, forwarded together with the audio.
        gen_text: Text to synthesize.
        model: ``"F5-TTS"`` or ``"E2-TTS"``; selects which loaded model is used.
        remove_silence: When truthy, the generated audio is written to a WAV
            file, passed through ``remove_silence_for_generated_wav`` and reloaded.
        cross_fade_duration: Forwarded to ``infer_process``.
        speed: Forwarded to ``infer_process``.
        show_info: Callable used for status messages (``gr.Info`` by default).

    Returns:
        ``((sample_rate, waveform), spectrogram_path)`` where ``spectrogram_path``
        is a PNG file written by ``save_spectrogram``.

    Raises:
        ValueError: If ``model`` is not one of the two supported names.
    """
    # Resolve the model first so an unknown name fails fast with a clear message
    # instead of an UnboundLocalError after the reference audio was preprocessed.
    if model == "F5-TTS":
        ema_model = F5TTS_ema_model
    elif model == "E2-TTS":
        ema_model = E2TTS_ema_model
    else:
        raise ValueError(f"Unknown TTS model {model!r}; expected 'F5-TTS' or 'E2-TTS'.")

    ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)

    final_wave, final_sample_rate, combined_spectrogram = infer_process(
        ref_audio,
        ref_text,
        gen_text,
        ema_model,
        vocoder,
        cross_fade_duration=cross_fade_duration,
        speed=speed,
        show_info=show_info,
        progress=gr.Progress(),
    )

    # Remove silence
    if remove_silence:
        # The WAV is only an intermediate artifact: keep it in a scratch
        # directory that is deleted as soon as the audio is back in memory.
        with tempfile.TemporaryDirectory() as scratch_dir:
            wav_path = f"{scratch_dir}/generated.wav"
            sf.write(wav_path, final_wave, final_sample_rate)
            remove_silence_for_generated_wav(wav_path)
            final_wave, _ = torchaudio.load(wav_path)
        final_wave = final_wave.squeeze().cpu().numpy()

    # Save the spectrogram. The file must outlive this call because its path is
    # returned to the caller, hence delete=False.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
        spectrogram_path = tmp_spectrogram.name
        save_spectrogram(combined_spectrogram, spectrogram_path)

    return (final_sample_rate, final_wave), spectrogram_path
118
-
119
-
120
# Static "Credits" tab; shown as the last tab of the combined interface.
with gr.Blocks() as app_credits:
    gr.Markdown("""
# Credits

* [mrfakename](https://github.com/fakerybakery) for the original [online demo](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
* [RootingInLoad](https://github.com/RootingInLoad) for initial chunk generation and podcast app exploration
* [jpgallegoar](https://github.com/jpgallegoar) for multiple speech-type generation & voice chat
""")
128
# "TTS" tab: one reference clip, one text box, one synthesized result.
with gr.Blocks() as app_tts:
    gr.Markdown("# Batched TTS")

    # Primary inputs
    ref_audio_input = gr.Audio(label="Reference Audio", type="filepath")
    gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
    model_choice = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
    generate_btn = gr.Button("Synthesize", variant="primary")

    # Optional tuning knobs, collapsed by default
    with gr.Accordion("Advanced Settings", open=False):
        ref_text_input = gr.Textbox(
            label="Reference Text",
            info="Leave blank to automatically transcribe the reference audio. If you enter text it will override automatic transcription.",
            lines=2,
        )
        remove_silence = gr.Checkbox(
            label="Remove Silences",
            info="The model tends to produce silences, especially on longer audio. We can manually remove silences if needed. Note that this is an experimental feature and may produce strange results. This will also increase generation time.",
            value=False,
        )
        speed_slider = gr.Slider(
            label="Speed",
            minimum=0.3,
            maximum=2.0,
            value=1.0,
            step=0.1,
            info="Adjust the speed of the audio.",
        )
        cross_fade_duration_slider = gr.Slider(
            label="Cross-Fade Duration (s)",
            minimum=0.0,
            maximum=1.0,
            value=0.15,
            step=0.01,
            info="Set the duration of the cross-fade between audio clips.",
        )

    # Outputs
    audio_output = gr.Audio(label="Synthesized Audio")
    spectrogram_output = gr.Image(label="Spectrogram")

    # The input order below mirrors the positional parameters of infer().
    generate_btn.click(
        infer,
        inputs=[
            ref_audio_input,
            ref_text_input,
            gen_text_input,
            model_choice,
            remove_silence,
            cross_fade_duration_slider,
            speed_slider,
        ],
        outputs=[audio_output, spectrogram_output],
    )
178
-
179
-
180
def parse_speechtypes_text(gen_text):
    """Split a script into segments tagged with their speech type.

    A ``{name}`` marker switches the current speech type; the text that follows
    it (up to the next marker) belongs to that type. Text before the first
    marker uses ``"Regular"``. Whitespace around names and text is stripped and
    empty text pieces are dropped.

    Args:
        gen_text: The script to parse. ``None`` is treated like an empty string.

    Returns:
        A list of ``{"style": <name>, "text": <text>}`` dicts in script order.
    """
    # With a single capturing group, re.split alternates: text, style, text, ...
    tokens = re.split(r"\{(.*?)\}", gen_text or "")

    segments = []
    current_style = "Regular"
    for position, token in enumerate(tokens):
        token = token.strip()
        if position % 2:
            # Odd positions hold the captured marker name.
            current_style = token
        elif token:
            segments.append({"style": current_style, "text": token})

    return segments
203
-
204
-
205
- with gr.Blocks() as app_multistyle:
206
- # New section for multistyle generation
207
- gr.Markdown(
208
- """
209
- # Multiple Speech-Type Generation
210
-
211
- This section allows you to generate multiple speech types or multiple people's voices. Enter your text in the format shown below, and the system will generate speech using the appropriate type. If unspecified, the model will use the regular speech type. The current speech type will be used until the next speech type is specified.
212
- """
213
- )
214
-
215
- with gr.Row():
216
- gr.Markdown(
217
- """
218
- **Example Input:**
219
- {Regular} Hello, I'd like to order a sandwich please.
220
- {Surprised} What do you mean you're out of bread?
221
- {Sad} I really wanted a sandwich though...
222
- {Angry} You know what, darn you and your little shop!
223
- {Whisper} I'll just go back home and cry now.
224
- {Shouting} Why me?!
225
- """
226
- )
227
-
228
- gr.Markdown(
229
- """
230
- **Example Input 2:**
231
- {Speaker1_Happy} Hello, I'd like to order a sandwich please.
232
- {Speaker2_Regular} Sorry, we're out of bread.
233
- {Speaker1_Sad} I really wanted a sandwich though...
234
- {Speaker2_Whisper} I'll give you the last one I was hiding.
235
- """
236
- )
237
-
238
- gr.Markdown(
239
- "Upload different audio clips for each speech type. The first speech type is mandatory. You can add additional speech types by clicking the 'Add Speech Type' button."
240
- )
241
-
242
- # Regular speech type (mandatory)
243
- with gr.Row():
244
- with gr.Column():
245
- regular_name = gr.Textbox(value="Regular", label="Speech Type Name")
246
- regular_insert = gr.Button("Insert", variant="secondary")
247
- regular_audio = gr.Audio(label="Regular Reference Audio", type="filepath")
248
- regular_ref_text = gr.Textbox(label="Reference Text (Regular)", lines=2)
249
-
250
- # Additional speech types (up to 99 more)
251
- max_speech_types = 100
252
- speech_type_rows = []
253
- speech_type_names = [regular_name]
254
- speech_type_audios = []
255
- speech_type_ref_texts = []
256
- speech_type_delete_btns = []
257
- speech_type_insert_btns = []
258
- speech_type_insert_btns.append(regular_insert)
259
-
260
- for i in range(max_speech_types - 1):
261
- with gr.Row(visible=False) as row:
262
- with gr.Column():
263
- name_input = gr.Textbox(label="Speech Type Name")
264
- delete_btn = gr.Button("Delete", variant="secondary")
265
- insert_btn = gr.Button("Insert", variant="secondary")
266
- audio_input = gr.Audio(label="Reference Audio", type="filepath")
267
- ref_text_input = gr.Textbox(label="Reference Text", lines=2)
268
- speech_type_rows.append(row)
269
- speech_type_names.append(name_input)
270
- speech_type_audios.append(audio_input)
271
- speech_type_ref_texts.append(ref_text_input)
272
- speech_type_delete_btns.append(delete_btn)
273
- speech_type_insert_btns.append(insert_btn)
274
-
275
- # Button to add speech type
276
- add_speech_type_btn = gr.Button("Add Speech Type")
277
-
278
- # Keep track of current number of speech types
279
- speech_type_count = gr.State(value=0)
280
-
281
- # Function to add a speech type
282
def add_speech_type_fn(speech_type_count):
    """Reveal one more additional speech-type row, up to the configured maximum.

    Returns the (possibly incremented) count followed by one update per
    additional row: rows below the new count are made visible, all other rows
    get a no-op update. At capacity nothing changes.
    """
    total_rows = max_speech_types - 1

    if speech_type_count >= total_rows:
        # Optionally, show a warning
        return [speech_type_count] + [gr.update() for _ in range(total_rows)]

    speech_type_count += 1
    row_updates = [
        gr.update(visible=True) if row < speech_type_count else gr.update() for row in range(total_rows)
    ]
    return [speech_type_count] + row_updates
296
-
297
- add_speech_type_btn.click(
298
- add_speech_type_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows
299
- )
300
-
301
- # Function to delete a speech type
302
def make_delete_speech_type_fn(index):
    """Build the click handler that hides the additional speech-type row ``index``."""

    def delete_speech_type_fn(speech_type_count):
        # Hide only the targeted row; every other row receives a no-op update.
        row_updates = [
            gr.update(visible=False) if row == index else gr.update() for row in range(max_speech_types - 1)
        ]
        # The counter never drops below zero.
        remaining = max(0, speech_type_count - 1)
        return [remaining] + row_updates

    return delete_speech_type_fn
318
-
319
- # Update delete button clicks
320
- for i, delete_btn in enumerate(speech_type_delete_btns):
321
- delete_fn = make_delete_speech_type_fn(i)
322
- delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
323
-
324
- # Text input for the prompt
325
- gen_text_input_multistyle = gr.Textbox(
326
- label="Text to Generate",
327
- lines=10,
328
- placeholder="Enter the script with speaker names (or emotion types) at the start of each block, e.g.:\n\n{Regular} Hello, I'd like to order a sandwich please.\n{Surprised} What do you mean you're out of bread?\n{Sad} I really wanted a sandwich though...\n{Angry} You know what, darn you and your little shop!\n{Whisper} I'll just go back home and cry now.\n{Shouting} Why me?!",
329
- )
330
-
331
def make_insert_speech_type_fn(index):
    """Build the click handler that appends a ``{name}`` marker to the script text.

    ``index`` is accepted for symmetry with the other handler factories; the
    returned handler does not read it.
    """

    def insert_speech_type_fn(current_text, speech_type_name):
        existing = current_text or ""
        tag = speech_type_name or "None"
        return gr.update(value=existing + f"{{{tag}}} ")

    return insert_speech_type_fn
339
-
340
- for i, insert_btn in enumerate(speech_type_insert_btns):
341
- insert_fn = make_insert_speech_type_fn(i)
342
- insert_btn.click(
343
- insert_fn,
344
- inputs=[gen_text_input_multistyle, speech_type_names[i]],
345
- outputs=gen_text_input_multistyle,
346
- )
347
-
348
- # Model choice
349
- model_choice_multistyle = gr.Radio(choices=["F5-TTS", "E2-TTS"], label="Choose TTS Model", value="F5-TTS")
350
-
351
- with gr.Accordion("Advanced Settings", open=False):
352
- remove_silence_multistyle = gr.Checkbox(
353
- label="Remove Silences",
354
- value=False,
355
- )
356
-
357
- # Generate button
358
- generate_multistyle_btn = gr.Button("Generate Multi-Style Speech", variant="primary")
359
-
360
- # Output audio
361
- audio_output_multistyle = gr.Audio(label="Synthesized Audio")
362
-
363
@gpu_decorator
def generate_multistyle_speech(
    regular_audio,
    regular_ref_text,
    gen_text,
    *args,
):
    """Synthesize a script whose segments use different speech types.

    Args:
        regular_audio: Reference audio for the mandatory "Regular" speech type.
        regular_ref_text: Reference text for the "Regular" speech type.
        gen_text: Script containing ``{name}`` markers (see ``parse_speechtypes_text``).
        *args: Values of the remaining click inputs, in the order they are wired:
            all speech-type names (the regular name first, then the additional
            ones), the additional reference audios, the additional reference
            texts, the model choice and finally the remove-silence flag.

    Returns:
        ``(sample_rate, waveform)`` with all segments concatenated, or ``None``
        (after a warning) when no segment produced audio.
    """
    num_additional_speech_types = max_speech_types - 1

    # The two trailing values are fixed; reading them from the end keeps them
    # correct regardless of how many component values precede them.
    *component_values, model_choice, remove_silence = args

    # The names list is one entry longer than the audio/text lists because it
    # starts with the regular speech type's name, which is skipped here.
    names_end = max_speech_types
    audios_end = names_end + num_additional_speech_types
    ref_texts_end = audios_end + num_additional_speech_types
    speech_type_names_list = component_values[1:names_end]
    speech_type_audios_list = component_values[names_end:audios_end]
    speech_type_ref_texts_list = component_values[audios_end:ref_texts_end]

    # Collect the speech types and their audios into a dict
    speech_types = {"Regular": {"audio": regular_audio, "ref_text": regular_ref_text}}

    for name_input, audio_input, ref_text_input in zip(
        speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
    ):
        # Rows without both a name and an audio clip are ignored.
        if name_input and audio_input:
            speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text_input}

    # Parse the gen_text into segments
    segments = parse_speechtypes_text(gen_text)

    # For each segment, generate speech
    generated_audio_segments = []

    for segment in segments:
        # If style not available, default to Regular
        current_style = segment["style"] if segment["style"] in speech_types else "Regular"

        ref_audio = speech_types[current_style]["audio"]
        ref_text = speech_types[current_style].get("ref_text", "")

        # Generate speech for this segment
        audio, _ = infer(
            ref_audio, ref_text, segment["text"], model_choice, remove_silence, 0, show_info=print
        )  # show_info=print no pull to top when generating
        sr, audio_data = audio

        generated_audio_segments.append(audio_data)

    # Concatenate all audio segments
    if generated_audio_segments:
        final_audio_data = np.concatenate(generated_audio_segments)
        return (sr, final_audio_data)

    gr.Warning("No audio generated.")
    return None
421
-
422
- generate_multistyle_btn.click(
423
- generate_multistyle_speech,
424
- inputs=[
425
- regular_audio,
426
- regular_ref_text,
427
- gen_text_input_multistyle,
428
- ]
429
- + speech_type_names
430
- + speech_type_audios
431
- + speech_type_ref_texts
432
- + [
433
- model_choice_multistyle,
434
- remove_silence_multistyle,
435
- ],
436
- outputs=audio_output_multistyle,
437
- )
438
-
439
- # Validation function to disable Generate button if speech types are missing
440
def validate_speech_types(gen_text, regular_name, *args):
    """Enable the generate button only when every speech type in the script is defined.

    Args:
        gen_text: Script containing ``{name}`` markers.
        regular_name: Name of the mandatory speech type.
        *args: Every further speech-type name passed by the change handler.
            All of them are considered; truncating this list would silently
            drop the last name because the wired inputs already start with the
            regular name.

    Returns:
        A ``gr.update`` setting ``interactive`` to ``False`` when the script
        uses a speech type with no matching name, ``True`` otherwise.
    """
    # Collect the speech types names (empty names are ignored)
    speech_types_available = {name for name in (regular_name, *args) if name}

    # Parse the gen_text to get the speech types used
    speech_types_in_text = {segment["style"] for segment in parse_speechtypes_text(gen_text)}

    # Interactive exactly when nothing used in the text is missing.
    return gr.update(interactive=speech_types_in_text <= speech_types_available)
465
-
466
- gen_text_input_multistyle.change(
467
- validate_speech_types,
468
- inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
469
- outputs=generate_multistyle_btn,
470
- )
471
-
472
-
473
- with gr.Blocks() as app_chat:
474
- gr.Markdown(
475
- """
476
- # Voice Chat
477
- Have a conversation with an AI using your reference voice!
478
- 1. Upload a reference audio clip and optionally its transcript.
479
- 2. Load the chat model.
480
- 3. Record your message through your microphone.
481
- 4. The AI will respond using the reference voice.
482
- """
483
- )
484
-
485
- if not USING_SPACES:
486
- load_chat_model_btn = gr.Button("Load Chat Model", variant="primary")
487
-
488
- chat_interface_container = gr.Column(visible=False)
489
-
490
@gpu_decorator
def load_chat_model():
    """Load the chat model and tokenizer once, then reveal the chat interface.

    The model and tokenizer are stored in the module-level
    ``chat_model_state`` / ``chat_tokenizer_state``; later calls reuse them.
    Returns two updates: hide the load button, show the chat container.
    """
    global chat_model_state, chat_tokenizer_state

    needs_loading = chat_model_state is None
    if needs_loading:
        gr.Info("Loading chat model...")
        model_name = "Qwen/Qwen2.5-3B-Instruct"
        chat_model_state = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype="auto",
            device_map="auto",
        )
        chat_tokenizer_state = AutoTokenizer.from_pretrained(model_name)
        gr.Info("Chat model loaded.")

    hide_button = gr.update(visible=False)
    show_chat = gr.update(visible=True)
    return hide_button, show_chat
504
-
505
- load_chat_model_btn.click(load_chat_model, outputs=[load_chat_model_btn, chat_interface_container])
506
-
507
- else:
508
- chat_interface_container = gr.Column()
509
-
510
- if chat_model_state is None:
511
- model_name = "Qwen/Qwen2.5-3B-Instruct"
512
- chat_model_state = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
513
- chat_tokenizer_state = AutoTokenizer.from_pretrained(model_name)
514
-
515
- with chat_interface_container:
516
- with gr.Row():
517
- with gr.Column():
518
- ref_audio_chat = gr.Audio(label="Reference Audio", type="filepath")
519
- with gr.Column():
520
- with gr.Accordion("Advanced Settings", open=False):
521
- model_choice_chat = gr.Radio(
522
- choices=["F5-TTS", "E2-TTS"],
523
- label="TTS Model",
524
- value="F5-TTS",
525
- )
526
- remove_silence_chat = gr.Checkbox(
527
- label="Remove Silences",
528
- value=True,
529
- )
530
- ref_text_chat = gr.Textbox(
531
- label="Reference Text",
532
- info="Optional: Leave blank to auto-transcribe",
533
- lines=2,
534
- )
535
- system_prompt_chat = gr.Textbox(
536
- label="System Prompt",
537
- value="You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
538
- lines=2,
539
- )
540
-
541
- chatbot_interface = gr.Chatbot(label="Conversation")
542
-
543
- with gr.Row():
544
- with gr.Column():
545
- audio_input_chat = gr.Microphone(
546
- label="Speak your message",
547
- type="filepath",
548
- )
549
- audio_output_chat = gr.Audio(autoplay=True)
550
- with gr.Column():
551
- text_input_chat = gr.Textbox(
552
- label="Type your message",
553
- lines=1,
554
- )
555
- send_btn_chat = gr.Button("Send")
556
- clear_btn_chat = gr.Button("Clear Conversation")
557
-
558
- conversation_state = gr.State(
559
- value=[
560
- {
561
- "role": "system",
562
- "content": "You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
563
- }
564
- ]
565
- )
566
-
567
- # Modify process_audio_input to use model and tokenizer from state
568
@gpu_decorator
def process_audio_input(audio_path, text, history, conv_state):
    """Handle audio or text input from user.

    Args:
        audio_path: Path of a recorded message, or a falsy value when the user typed.
        text: Typed message. May be ``None``: the event chains in this file
            reset the text box with ``lambda: None`` after each turn.
        history: Chatbot history as a list of ``(user, assistant)`` tuples.
        conv_state: Message dicts (``role`` / ``content``) sent to the chat model.

    Returns:
        ``(history, conv_state, "")``. Both lists are updated in place with the
        user message and the model's reply; they are returned unchanged when
        there is nothing to send.
    """
    # NOTE(review): the handlers that call this declare only two outputs, so
    # the trailing "" is presumably a leftover meant to clear the text box -
    # confirm before relying on it.
    text = text or ""

    if not audio_path and not text.strip():
        return history, conv_state, ""

    if audio_path:
        # Reuse the reference-audio preprocessing; element [1] of its result is
        # the text used as the user's message.
        text = preprocess_ref_audio_text(audio_path, text)[1]

    if not text.strip():
        return history, conv_state, ""

    conv_state.append({"role": "user", "content": text})
    history.append((text, None))

    response = generate_response(conv_state, chat_model_state, chat_tokenizer_state)

    conv_state.append({"role": "assistant", "content": response})
    history[-1] = (text, response)

    return history, conv_state, ""
590
-
591
@gpu_decorator
def generate_audio_response(history, ref_audio, ref_text, model, remove_silence):
    """Generate TTS audio for the latest AI response.

    Returns ``None`` when there is no history, no reference audio, or the last
    history entry has no AI response yet; otherwise the audio part of
    ``infer``'s result.
    """
    if not (history and ref_audio):
        return None

    # Only the assistant half of the last (user, assistant) pair is spoken.
    reply = history[-1][1]
    if not reply:
        return None

    synthesized = infer(
        ref_audio,
        ref_text,
        reply,
        model,
        remove_silence,
        cross_fade_duration=0.15,
        speed=1.0,
        show_info=print,  # show_info=print no pull to top when generating
    )
    return synthesized[0]
612
-
613
def clear_conversation():
    """Reset the conversation.

    Returns an empty chat history and a fresh conversation state that contains
    only the default system message.
    """
    system_message = {
        "role": "system",
        "content": "You are not an AI assistant, you are whoever the user says you are. You must stay in character. Keep your responses concise since they will be spoken out loud.",
    }
    return [], [system_message]
621
-
622
def update_system_prompt(new_prompt):
    """Update the system prompt and reset the conversation.

    Returns an empty chat history and a conversation state holding only a
    system message with ``new_prompt`` as its content.
    """
    system_message = {"role": "system", "content": new_prompt}
    return [], [system_message]
626
-
627
- # Handle audio input
628
- audio_input_chat.stop_recording(
629
- process_audio_input,
630
- inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
631
- outputs=[chatbot_interface, conversation_state],
632
- ).then(
633
- generate_audio_response,
634
- inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
635
- outputs=[audio_output_chat],
636
- ).then(
637
- lambda: None,
638
- None,
639
- audio_input_chat,
640
- )
641
-
642
- # Handle text input
643
- text_input_chat.submit(
644
- process_audio_input,
645
- inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
646
- outputs=[chatbot_interface, conversation_state],
647
- ).then(
648
- generate_audio_response,
649
- inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
650
- outputs=[audio_output_chat],
651
- ).then(
652
- lambda: None,
653
- None,
654
- text_input_chat,
655
- )
656
-
657
- # Handle send button
658
- send_btn_chat.click(
659
- process_audio_input,
660
- inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
661
- outputs=[chatbot_interface, conversation_state],
662
- ).then(
663
- generate_audio_response,
664
- inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
665
- outputs=[audio_output_chat],
666
- ).then(
667
- lambda: None,
668
- None,
669
- text_input_chat,
670
- )
671
-
672
- # Handle clear button
673
- clear_btn_chat.click(
674
- clear_conversation,
675
- outputs=[chatbot_interface, conversation_state],
676
- )
677
-
678
- # Handle system prompt change and reset conversation
679
- system_prompt_chat.change(
680
- update_system_prompt,
681
- inputs=system_prompt_chat,
682
- outputs=[chatbot_interface, conversation_state],
683
- )
684
-
685
-
686
# Top-level application: an introduction followed by the tabs built earlier in this file.
with gr.Blocks() as app:
    gr.Markdown(
        """
# E2/F5 TTS

This is a local web UI for F5 TTS with advanced batch processing support. This app supports the following TTS models:

* [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
* [E2 TTS](https://arxiv.org/abs/2406.18009) (Embarrassingly Easy Fully Non-Autoregressive Zero-Shot TTS)

The checkpoints support English and Chinese.

If you're having issues, try converting your reference audio to WAV or MP3, clipping it to 15s, and shortening your prompt.

**NOTE: Reference text will be automatically transcribed with Whisper if not provided. For best results, keep your reference clips short (<15s). Ensure the audio is fully uploaded before generating.**
"""
    )
    # Tab titles are listed in the same order as the interfaces.
    gr.TabbedInterface(
        [app_tts, app_multistyle, app_chat, app_credits],
        ["TTS", "Multi-Speech", "Voice-Chat", "Credits"],
    )
707
-
708
-
709
@click.command()
@click.option("--port", "-p", default=None, type=int, help="Port to run the app on")
@click.option("--host", "-H", default=None, help="Host to run the app on")
@click.option(
    "--share",
    "-s",
    default=False,
    is_flag=True,
    help="Share the app via Gradio share link",
)
@click.option("--api", "-a", default=True, is_flag=True, help="Allow API access")
def main(port, host, share, api):
    """Command-line entry point: queue and launch the combined Gradio app."""
    print("Starting app...")
    # ``api`` controls both whether the queue's API is open and whether the
    # API page is shown.
    queued_app = app.queue(api_open=api)
    queued_app.launch(server_name=host, server_port=port, share=share, show_api=api)
724
-
725
-
726
if __name__ == "__main__":
    if USING_SPACES:
        # With the spaces package present, launch directly with default settings.
        app.queue().launch()
    else:
        # Otherwise go through the click command so CLI options are honoured.
        main()