Gregniuki commited on
Commit
986683c
1 Parent(s): 9c7ea36

Upload 7 files

Browse files
Files changed (7) hide show
  1. .gitignore +173 -0
  2. .gitmodules +3 -0
  3. .pre-commit-config.yaml +14 -0
  4. LICENSE +21 -0
  5. README.md +171 -12
  6. pyproject.toml +61 -0
  7. ruff.toml +10 -0
.gitignore ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Custom
2
+ .vscode/
3
+ tests/
4
+ runs/
5
+ data/
6
+ ckpts/
7
+ wandb/
8
+ results/
9
+
10
+
11
+
12
+ # Byte-compiled / optimized / DLL files
13
+ __pycache__/
14
+ *.py[cod]
15
+ *$py.class
16
+
17
+ # C extensions
18
+ *.so
19
+
20
+ # Distribution / packaging
21
+ .Python
22
+ build/
23
+ develop-eggs/
24
+ dist/
25
+ downloads/
26
+ eggs/
27
+ .eggs/
28
+ lib/
29
+ lib64/
30
+ parts/
31
+ sdist/
32
+ var/
33
+ wheels/
34
+ share/python-wheels/
35
+ *.egg-info/
36
+ .installed.cfg
37
+ *.egg
38
+ MANIFEST
39
+
40
+ # PyInstaller
41
+ # Usually these files are written by a python script from a template
42
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
43
+ *.manifest
44
+ *.spec
45
+
46
+ # Installer logs
47
+ pip-log.txt
48
+ pip-delete-this-directory.txt
49
+
50
+ # Unit test / coverage reports
51
+ htmlcov/
52
+ .tox/
53
+ .nox/
54
+ .coverage
55
+ .coverage.*
56
+ .cache
57
+ nosetests.xml
58
+ coverage.xml
59
+ *.cover
60
+ *.py,cover
61
+ .hypothesis/
62
+ .pytest_cache/
63
+ cover/
64
+
65
+ # Translations
66
+ *.mo
67
+ *.pot
68
+
69
+ # Django stuff:
70
+ *.log
71
+ local_settings.py
72
+ db.sqlite3
73
+ db.sqlite3-journal
74
+
75
+ # Flask stuff:
76
+ instance/
77
+ .webassets-cache
78
+
79
+ # Scrapy stuff:
80
+ .scrapy
81
+
82
+ # Sphinx documentation
83
+ docs/_build/
84
+
85
+ # PyBuilder
86
+ .pybuilder/
87
+ target/
88
+
89
+ # Jupyter Notebook
90
+ .ipynb_checkpoints
91
+
92
+ # IPython
93
+ profile_default/
94
+ ipython_config.py
95
+
96
+ # pyenv
97
+ # For a library or package, you might want to ignore these files since the code is
98
+ # intended to run in multiple environments; otherwise, check them in:
99
+ # .python-version
100
+
101
+ # pipenv
102
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
103
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
104
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
105
+ # install all needed dependencies.
106
+ #Pipfile.lock
107
+
108
+ # poetry
109
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
110
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
111
+ # commonly ignored for libraries.
112
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
113
+ #poetry.lock
114
+
115
+ # pdm
116
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
117
+ #pdm.lock
118
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
119
+ # in version control.
120
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
121
+ .pdm.toml
122
+ .pdm-python
123
+ .pdm-build/
124
+
125
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
126
+ __pypackages__/
127
+
128
+ # Celery stuff
129
+ celerybeat-schedule
130
+ celerybeat.pid
131
+
132
+ # SageMath parsed files
133
+ *.sage.py
134
+
135
+ # Environments
136
+ .env
137
+ .venv
138
+ env/
139
+ venv/
140
+ ENV/
141
+ env.bak/
142
+ venv.bak/
143
+
144
+ # Spyder project settings
145
+ .spyderproject
146
+ .spyproject
147
+
148
+ # Rope project settings
149
+ .ropeproject
150
+
151
+ # mkdocs documentation
152
+ /site
153
+
154
+ # mypy
155
+ .mypy_cache/
156
+ .dmypy.json
157
+ dmypy.json
158
+
159
+ # Pyre type checker
160
+ .pyre/
161
+
162
+ # pytype static type analyzer
163
+ .pytype/
164
+
165
+ # Cython debug symbols
166
+ cython_debug/
167
+
168
+ # PyCharm
169
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
170
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
171
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
172
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
173
+ #.idea/
.gitmodules ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [submodule "src/third_party/BigVGAN"]
2
+ path = src/third_party/BigVGAN
3
+ url = https://github.com/NVIDIA/BigVGAN.git
.pre-commit-config.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ # Ruff version.
4
+ rev: v0.7.0
5
+ hooks:
6
+ # Run the linter.
7
+ - id: ruff
8
+ args: [--fix]
9
+ # Run the formatter.
10
+ - id: ruff-format
11
+ - repo: https://github.com/pre-commit/pre-commit-hooks
12
+ rev: v2.3.0
13
+ hooks:
14
+ - id: check-yaml
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Yushen CHEN
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,171 @@
1
- ---
2
- title: F5-tts Polish English German
3
- emoji: 🐢
4
- colorFrom: green
5
- colorTo: red
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- short_description: voice cloner
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching
2
+
3
+ [![python](https://img.shields.io/badge/Python-3.10-brightgreen)](https://github.com/SWivid/F5-TTS)
4
+ [![arXiv](https://img.shields.io/badge/arXiv-2410.06885-b31b1b.svg?logo=arXiv)](https://arxiv.org/abs/2410.06885)
5
+ [![demo](https://img.shields.io/badge/GitHub-Demo%20page-orange.svg)](https://swivid.github.io/F5-TTS/)
6
+ [![hfspace](https://img.shields.io/badge/🤗-Space%20demo-yellow)](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
7
+ [![msspace](https://img.shields.io/badge/🤖-Space%20demo-blue)](https://modelscope.cn/studios/modelscope/E2-F5-TTS)
8
+ [![lab](https://img.shields.io/badge/X--LANCE-Lab-grey?labelColor=lightgrey)](https://x-lance.sjtu.edu.cn/)
9
+ <img src="https://github.com/user-attachments/assets/12d7749c-071a-427c-81bf-b87b91def670" alt="Watermark" style="width: 40px; height: auto">
10
+
11
+ **F5-TTS**: Diffusion Transformer with ConvNeXt V2, faster training and inference.
12
+
13
+ **E2 TTS**: Flat-UNet Transformer, closest reproduction of the [paper](https://arxiv.org/abs/2406.18009).
14
+
15
+ **Sway Sampling**: Inference-time flow step sampling strategy, greatly improves performance.
16
+
17
+ ### Thanks to all the contributors!
18
+
19
+ ## News
20
+ - **2024/10/08**: F5-TTS & E2 TTS base models on [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS), [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), [🟣 Wisemodel](https://wisemodel.cn/models/SJTU_X-LANCE/F5-TTS_Emilia-ZH-EN).
21
+
22
+ ## Installation
23
+
24
+ ```bash
25
+ # Create a python 3.10 conda env (you could also use virtualenv)
26
+ conda create -n f5-tts python=3.10
27
+ conda activate f5-tts
28
+
29
+ # Install pytorch with your CUDA version, e.g.
30
+ pip install torch==2.3.0+cu118 torchaudio==2.3.0+cu118 --extra-index-url https://download.pytorch.org/whl/cu118
31
+ ```
32
+
33
+ Then you can choose from a few options below:
34
+
35
+ ### 1. As a pip package (if just for inference)
36
+
37
+ ```bash
38
+ pip install git+https://github.com/SWivid/F5-TTS.git
39
+ ```
40
+
41
+ ### 2. Local editable (if you also want to do training or finetuning)
42
+
43
+ ```bash
44
+ git clone https://github.com/SWivid/F5-TTS.git
45
+ cd F5-TTS
46
+ # git submodule update --init --recursive # (optional, if need bigvgan)
47
+ pip install -e .
48
+ ```
49
+ If you initialize the submodule, add the following code at the beginning of `src/third_party/BigVGAN/bigvgan.py`:
50
+ ```python
51
+ import os
52
+ import sys
53
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
54
+ ```
55
+
56
+ ### 3. Docker usage
57
+ ```bash
58
+ # Build from Dockerfile
59
+ docker build -t f5tts:v1 .
60
+
61
+ # Or pull from GitHub Container Registry
62
+ docker pull ghcr.io/swivid/f5-tts:main
63
+ ```
64
+
65
+
66
+ ## Inference
67
+
68
+ ### 1. Gradio App
69
+
70
+ Currently supported features:
71
+
72
+ - Basic TTS with Chunk Inference
73
+ - Multi-Style / Multi-Speaker Generation
74
+ - Voice Chat powered by Qwen2.5-3B-Instruct
75
+
76
+ ```bash
77
+ # Launch a Gradio app (web interface)
78
+ f5-tts_infer-gradio
79
+
80
+ # Specify the port/host
81
+ f5-tts_infer-gradio --port 7860 --host 0.0.0.0
82
+
83
+ # Launch a share link
84
+ f5-tts_infer-gradio --share
85
+ ```
86
+
87
+ ### 2. CLI Inference
88
+
89
+ ```bash
90
+ # Run with flags
91
+ # Leaving --ref_text "" will have the ASR model transcribe the reference audio (extra GPU memory usage)
92
+ f5-tts_infer-cli \
93
+ --model "F5-TTS" \
94
+ --ref_audio "ref_audio.wav" \
95
+ --ref_text "The content, subtitle or transcription of reference audio." \
96
+ --gen_text "Some text you want TTS model generate for you."
97
+
98
+ # Run with default setting. src/f5_tts/infer/examples/basic/basic.toml
99
+ f5-tts_infer-cli
100
+ # Or with your own .toml file
101
+ f5-tts_infer-cli -c custom.toml
102
+
103
+ # Multi voice. See src/f5_tts/infer/README.md
104
+ f5-tts_infer-cli -c src/f5_tts/infer/examples/multi/story.toml
105
+ ```
106
+
107
+ ### 3. More instructions
108
+
109
+ - In order to have better generation results, take a moment to read [detailed guidance](src/f5_tts/infer).
110
+ - The [Issues](https://github.com/SWivid/F5-TTS/issues?q=is%3Aissue) are very useful; please try to find a solution by searching for keywords related to the problem you encountered. If no answer is found, feel free to open an issue.
111
+
112
+
113
+ ## Training
114
+
115
+ ### 1. Gradio App
116
+
117
+ Read [training & finetuning guidance](src/f5_tts/train) for more instructions.
118
+
119
+ ```bash
120
+ # Quick start with Gradio web interface
121
+ f5-tts_finetune-gradio
122
+ ```
123
+
124
+
125
+ ## [Evaluation](src/f5_tts/eval)
126
+
127
+
128
+ ## Development
129
+
130
+ Use pre-commit to ensure code quality (will run linters and formatters automatically)
131
+
132
+ ```bash
133
+ pip install pre-commit
134
+ pre-commit install
135
+ ```
136
+
137
+ When making a pull request, before each commit, run:
138
+
139
+ ```bash
140
+ pre-commit run --all-files
141
+ ```
142
+
143
+ Note: Some model components have linting exceptions for F722 (forward-annotation syntax error) to accommodate tensor notation.
144
+
145
+
146
+ ## Acknowledgements
147
+
148
+ - [E2-TTS](https://arxiv.org/abs/2406.18009) brilliant work, simple and effective
149
+ - [Emilia](https://arxiv.org/abs/2407.05361), [WenetSpeech4TTS](https://arxiv.org/abs/2406.05763) valuable datasets
150
+ - [lucidrains](https://github.com/lucidrains) initial CFM structure, and also [bfs18](https://github.com/bfs18) for discussion
151
+ - [SD3](https://arxiv.org/abs/2403.03206) & [Hugging Face diffusers](https://github.com/huggingface/diffusers) DiT and MMDiT code structure
152
+ - [torchdiffeq](https://github.com/rtqichen/torchdiffeq) as ODE solver, [Vocos](https://huggingface.co/charactr/vocos-mel-24khz) as vocoder
153
+ - [FunASR](https://github.com/modelscope/FunASR), [faster-whisper](https://github.com/SYSTRAN/faster-whisper), [UniSpeech](https://github.com/microsoft/UniSpeech) for evaluation tools
154
+ - [ctc-forced-aligner](https://github.com/MahmoudAshraf97/ctc-forced-aligner) for speech edit test
155
+ - [mrfakename](https://x.com/realmrfakename) huggingface space demo ~
156
+ - [f5-tts-mlx](https://github.com/lucasnewman/f5-tts-mlx/tree/main) Implementation with MLX framework by [Lucas Newman](https://github.com/lucasnewman)
157
+ - [F5-TTS-ONNX](https://github.com/DakeQQ/F5-TTS-ONNX) ONNX Runtime version by [DakeQQ](https://github.com/DakeQQ)
158
+
159
+ ## Citation
160
+ If our work and codebase are useful to you, please cite as:
161
+ ```
162
+ @article{chen-etal-2024-f5tts,
163
+ title={F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching},
164
+ author={Yushen Chen and Zhikang Niu and Ziyang Ma and Keqi Deng and Chunhui Wang and Jian Zhao and Kai Yu and Xie Chen},
165
+ journal={arXiv preprint arXiv:2410.06885},
166
+ year={2024},
167
+ }
168
+ ```
169
+ ## License
170
+
171
+ Our code is released under MIT License. The pre-trained models are licensed under the CC-BY-NC license due to the training data Emilia, which is an in-the-wild dataset. Sorry for any inconvenience this may cause.
pyproject.toml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools >= 61.0", "setuptools-scm>=8.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "f5-tts"
7
+ dynamic = ["version"]
8
+ description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
9
+ readme = "README.md"
10
+ license = {text = "MIT License"}
11
+ classifiers = [
12
+ "License :: OSI Approved :: MIT License",
13
+ "Operating System :: OS Independent",
14
+ "Programming Language :: Python :: 3",
15
+ ]
16
+ dependencies = [
17
+ "accelerate>=0.33.0",
18
+ "bitsandbytes>0.37.0",
19
+ "cached_path",
20
+ "click",
21
+ "datasets",
22
+ "ema_pytorch>=0.5.2",
23
+ "gradio>=3.45.2",
24
+ "jieba",
25
+ "librosa",
26
+ "matplotlib",
27
+ "numpy<=1.26.4",
28
+ "pydub",
29
+ "pypinyin",
30
+ "safetensors",
31
+ "soundfile",
32
+ "tomli",
33
+ "torch>=2.0.0",
34
+ "torchaudio>=2.0.0",
35
+ "torchdiffeq",
36
+ "tqdm>=4.65.0",
37
+ "transformers",
38
+ "transformers_stream_generator",
39
+ "vocos",
40
+ "wandb",
41
+ "x_transformers>=1.31.14",
42
+ ]
43
+
44
+ [project.optional-dependencies]
45
+ eval = [
46
+ "faster_whisper==0.10.1",
47
+ "funasr",
48
+ "jiwer",
49
+ "modelscope",
50
+ "zhconv",
51
+ "zhon",
52
+ ]
53
+
54
+ [project.urls]
55
+ Homepage = "https://github.com/SWivid/F5-TTS"
56
+
57
+ [project.scripts]
58
+ "f5-tts_infer-cli" = "f5_tts.infer.infer_cli:main"
59
+ "f5-tts_infer-gradio" = "f5_tts.infer.infer_gradio:main"
60
+ "f5-tts_finetune-cli" = "f5_tts.train.finetune_cli:main"
61
+ "f5-tts_finetune-gradio" = "f5_tts.train.finetune_gradio:main"
ruff.toml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ line-length = 120
2
+ target-version = "py310"
3
+
4
+ [lint]
5
+ # Only ignore variables with names starting with "_".
6
+ dummy-variable-rgx = "^_.*$"
7
+
8
+ [lint.isort]
9
+ force-single-line = true
10
+ lines-after-imports = 2