mrfakename committed
Commit 03e430f
1 Parent(s): 420fb24

Sync from GitHub repo


This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions to the GitHub repo.

Files changed (3)
  1. app.py +44 -18
  2. pyproject.toml +1 -1
  3. src/f5_tts/infer/SHARED.md +12 -6
app.py CHANGED
@@ -4,6 +4,7 @@
 import re
 import tempfile
 from collections import OrderedDict
+from importlib.resources import files
 
 import click
 import gradio as gr
@@ -71,6 +72,7 @@ def load_custom(ckpt_path: str, vocab_path="", model_cfg=None):
 
 F5TTS_ema_model = load_f5tts()
 E2TTS_ema_model = load_e2tts() if USING_SPACES else None
+custom_ema_model, pre_custom_path = None, ""
 
 chat_model_state = None
 chat_tokenizer_state = None
@@ -115,8 +117,11 @@ def infer(
         ema_model = E2TTS_ema_model
     elif isinstance(model, list) and model[0] == "Custom":
         assert not USING_SPACES, "Only official checkpoints allowed in Spaces."
-        show_info("Loading Custom TTS model...")
-        custom_ema_model = load_custom(model[1], vocab_path=model[2])
+        global custom_ema_model, pre_custom_path
+        if pre_custom_path != model[1]:
+            show_info("Loading Custom TTS model...")
+            custom_ema_model = load_custom(model[1], vocab_path=model[2])
+            pre_custom_path = model[1]
         ema_model = custom_ema_model
 
     final_wave, final_sample_rate, combined_spectrogram = infer_process(
@@ -739,14 +744,29 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
 """
     )
 
+    last_used_custom = files("f5_tts").joinpath("infer/.cache/last_used_custom.txt")
+
+    def load_last_used_custom():
+        try:
+            with open(last_used_custom, "r") as f:
+                return f.read().split(",")
+        except FileNotFoundError:
+            last_used_custom.parent.mkdir(parents=True, exist_ok=True)
+            return [
+                "hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors",
+                "hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt",
+            ]
+
     def switch_tts_model(new_choice, custom_ckpt_path, custom_vocab_path):
         global tts_model_choice
         if new_choice == "Custom":
             tts_model_choice = ["Custom", custom_ckpt_path, custom_vocab_path]
-            return gr.update(visible=True)
+            with open(last_used_custom, "w") as f:
+                f.write(f"{custom_ckpt_path},{custom_vocab_path}")
+            return gr.update(visible=True), gr.update(visible=True)
         else:
             tts_model_choice = new_choice
-            return gr.update(visible=False)
+            return gr.update(visible=False), gr.update(visible=False)
 
     with gr.Row():
         if not USING_SPACES:
@@ -757,32 +777,38 @@ If you're having issues, try converting your reference audio to WAV or MP3, clip
             choose_tts_model = gr.Radio(
                 choices=[DEFAULT_TTS_MODEL, "E2-TTS"], label="Choose TTS Model", value=DEFAULT_TTS_MODEL
             )
-        with gr.Column(visible=False) as choose_custom_tts_model:
-            custom_ckpt_path = gr.Textbox(
-                placeholder="MODEL_CKPT: local_path | hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors",
-                show_label=False,
-                min_width=200,
-            )
-            custom_vocab_path = gr.Textbox(
-                placeholder="VOCAB_FILE: local_path | hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt | leave blank to use default",
-                show_label=False,
-                min_width=200,
-            )
+        custom_ckpt_path = gr.Dropdown(
+            choices=["hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"],
+            value=load_last_used_custom()[0],
+            allow_custom_value=True,
+            label="MODEL CKPT: local_path | hf://user_id/repo_id/model_ckpt",
+            visible=False,
+        )
+        custom_vocab_path = gr.Dropdown(
+            choices=["hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt"],
+            value=load_last_used_custom()[1],
+            allow_custom_value=True,
+            label="VOCAB FILE: local_path | hf://user_id/repo_id/vocab_file",
+            visible=False,
+        )
 
     choose_tts_model.change(
         switch_tts_model,
         inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
-        outputs=[choose_custom_tts_model],
+        outputs=[custom_ckpt_path, custom_vocab_path],
+        show_progress="hidden",
     )
     custom_ckpt_path.change(
        switch_tts_model,
        inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
-        outputs=[choose_custom_tts_model],
+        outputs=[custom_ckpt_path, custom_vocab_path],
+        show_progress="hidden",
    )
    custom_vocab_path.change(
        switch_tts_model,
        inputs=[choose_tts_model, custom_ckpt_path, custom_vocab_path],
-        outputs=[choose_custom_tts_model],
+        outputs=[custom_ckpt_path, custom_vocab_path],
+        show_progress="hidden",
    )
 
    gr.TabbedInterface(
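
The UI portion of this change replaces the hidden `gr.Column` of textboxes with two `gr.Dropdown` components whose visibility is toggled from the radio's `.change()` callback. Below is a minimal, standalone sketch of that Gradio pattern; it is not the app's code, the component names are illustrative, and it assumes a recent Gradio release where `allow_custom_value` and `show_progress="hidden"` are available.

```python
import gradio as gr

with gr.Blocks() as demo:
    # Illustrative components, not the ones defined in app.py
    model_choice = gr.Radio(choices=["Default", "Custom"], value="Default", label="Model")
    ckpt_box = gr.Dropdown(
        choices=["hf://user_id/repo_id/model.safetensors"], allow_custom_value=True, visible=False, label="MODEL CKPT"
    )
    vocab_box = gr.Dropdown(
        choices=["hf://user_id/repo_id/vocab.txt"], allow_custom_value=True, visible=False, label="VOCAB FILE"
    )

    def toggle_custom(choice):
        # One gr.update per output component; only "Custom" reveals the path inputs
        show = choice == "Custom"
        return gr.update(visible=show), gr.update(visible=show)

    model_choice.change(
        toggle_custom,
        inputs=model_choice,
        outputs=[ckpt_box, vocab_box],
        show_progress="hidden",  # avoid flashing a progress overlay for a pure visibility change
    )

demo.launch()
```

Returning `gr.update(visible=...)` for each output lets a single callback drive both dropdowns, which is why the commit widens the `.change()` handlers to two outputs.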
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "0.1.0"
+version = "0.1.1"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
src/f5_tts/infer/SHARED.md CHANGED
@@ -1,21 +1,27 @@
 <!-- omit in toc -->
 # Shared Model Cards
 
+<!-- omit in toc -->
+### **Prerequisites of using**
 - This document is serving as a quick lookup table for the community training/finetuning result, with various language support.
 - The models in this repository are open source and are based on voluntary contributions from contributors.
 - The use of models must be conditioned on respect for the respective creators. The convenience brought comes from their efforts.
-- Welcome to pull request sharing your result here.
 
+<!-- omit in toc -->
+### **Welcome to share here**
+- Have a pretrained/finetuned result: model checkpoint (pruned best to facilitate inference, i.e. leave only `ema_model_state_dict`) and corresponding vocab file (for tokenization).
+- Host a public [huggingface model repository](https://huggingface.co/new) and upload the model related files.
+- Make a pull request adding a model card to the current page, i.e. `src\f5_tts\infer\SHARED.md`.
 
 <!-- omit in toc -->
 ### Support Language
 - [Multilingual](#multilingual)
-- [F5-TTS Base @ pretrain @ zh \& en](#f5-tts-base--pretrain--zh--en)
+  - [F5-TTS Base @ pretrain @ zh \& en](#f5-tts-base--pretrain--zh--en)
 - [Mandarin](#mandarin)
 - [English](#english)
 
 
-### Multilingual
+## Multilingual
 
 #### F5-TTS Base @ pretrain @ zh & en
 |Model|🤗Hugging Face|Data (Hours)|Model License|
@@ -26,10 +32,10 @@
 MODEL_CKPT: hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors
 VOCAB_FILE: hf://SWivid/F5-TTS/F5TTS_Base/vocab.txt
 ```
-*Other infos, e.g. Link to some sampled results, Github repo, Usage instruction, Tutorial (Blog, Video, etc.) ...*
+*Other infos, e.g. Author info, Github repo, Link to some sampled results, Usage instruction, Tutorial (Blog, Video, etc.) ...*
 
-### Mandarin
+## Mandarin
 
 
-### English
+## English
 
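
The "pruned best to facilitate inference" note in the new **Welcome to share here** list refers to stripping a full training checkpoint down to its EMA weights before uploading. A minimal sketch of that step, assuming a PyTorch checkpoint whose top-level dict contains an `ema_model_state_dict` key; the file names are illustrative and the exact layout may differ between training setups.

```python
import torch

# Load the full training checkpoint (may also hold optimizer state, step counters, etc.)
ckpt = torch.load("model_last.pt", map_location="cpu")

# Keep only the EMA weights, which is all that inference needs
pruned = {"ema_model_state_dict": ckpt["ema_model_state_dict"]}

# Save the much smaller checkpoint for sharing
torch.save(pruned, "model_pruned.pt")
print(f"kept {len(pruned['ema_model_state_dict'])} tensors")
```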