Update
This view is limited to 50 files because it contains too many changes.
- .gitattributes +16 -34
- .gitignore +3 -0
- Experiments/TTS +1 -0
- Experiments/nohup.out +3 -0
- Experiments/run/events.out.tfevents.1706367627.edresson-train-80.45395.0 +3 -0
- Experiments/run/events.out.tfevents.1706367849.edresson-train-80.46052.0 +3 -0
- Experiments/run/events.out.tfevents.1706367954.edresson-train-80.46941.0 +3 -0
- Experiments/run/events.out.tfevents.1706446227.edresson-train-80.140666.0 +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model_124752.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_130000.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_135000.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/config.json +496 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json +15 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/train_syntacc_baseline.py +352 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/trainer_0_log.txt +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model_85001.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json +496 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json +15 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/train_syntacc_baseline.py +352 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/trainer_0_log.txt +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model_87192.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_130000.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_135000.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json +496 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json +15 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth +3 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/train_syntacc_baseline.py +352 -0
- Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/trainer_0_log.txt +3 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/best_model.pth +3 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/best_model_78415.pth +3 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_80000.pth +3 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth +3 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/config.json +496 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/language_ids.json +15 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/speakers.pth +3 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/train_syntacc.py +352 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/trainer_0_log.txt +3 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/best_model.pth +3 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/best_model_87818.pth +3 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/config.json +496 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/language_ids.json +15 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/speakers.pth +3 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/train_syntacc.py +352 -0
- Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/trainer_0_log.txt +3 -0
- Experiments/train_syntacc.py +352 -0
.gitattributes
CHANGED
@@ -1,35 +1,17 @@
-*.
-*.
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
+*.txt filter=lfs diff=lfs merge=lfs -text
+*.t7 filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
-*.
-
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.out filter=lfs diff=lfs merge=lfs -text
+*.0 filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
+*.o filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
+HierSpeech_TTS/denoiser/g_best filter=lfs diff=lfs merge=lfs -text
+g_best filter=lfs diff=lfs merge=lfs -text
+TTS-private/nohup.out filter=lfs diff=lfs merge=lfs -text
+nohup.out filter=lfs diff=lfs merge=lfs -text
+TTS-private/run/events.out.tfevents.1705084461.edresson-train-80-2.93786.0 filter=lfs diff=lfs merge=lfs -text
+events.out.tfevents.1705084461.edresson-train-80-2.93786.0 filter=lfs diff=lfs merge=lfs -text
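The rewritten rules above drop the stock Hugging Face extension list and instead route the artifacts that actually appear in this commit (checkpoints, TensorBoard event files, nohup logs, CSV metadata, shared objects) through Git LFS. A rough sketch of what the simple glob rules imply, using Python's fnmatch as an approximation of gitattributes matching (the helper below is illustrative only, not part of the repo):

```python
# Illustrative only: approximate which paths the new .gitattributes globs send
# to Git LFS. Real gitattributes matching follows gitignore-style semantics,
# so fnmatch is only a rough stand-in for the simple patterns used here.
from fnmatch import fnmatch

LFS_PATTERNS = ["*.txt", "*.t7", "*.zip", "*.pth", "*.wav", "*.pt",
                "*.out", "*.0", "*.csv", "*.o", "*.so", "nohup.out"]

def tracked_by_lfs(path: str) -> bool:
    name = path.rsplit("/", 1)[-1]  # match against the file name only
    return any(fnmatch(name, pattern) for pattern in LFS_PATTERNS)

print(tracked_by_lfs("Experiments/nohup.out"))         # True  (*.out)
print(tracked_by_lfs("Experiments/train_syntacc.py"))  # False (not tracked by LFS)
```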
.gitignore
ADDED
@@ -0,0 +1,3 @@
+*/.git/*
+.git
+.git/*
Experiments/TTS
ADDED
@@ -0,0 +1 @@
+Subproject commit a45dfd62668cd5778dd6a384308097ba0370c034
Experiments/nohup.out
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9b67f0b1dbf0a04937b9b6db1e55de4cc6057c5f1832ebb1a8c4e3c2f4b5a9e6
+size 19940074
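This file, like the checkpoints and event files below, is stored as a Git LFS pointer stub rather than the real payload: three lines giving the spec version, the SHA-256 object id, and the size in bytes. A minimal sketch (assuming exactly that three-line layout) that reads the oid and size out of a pointer stub:

```python
# Minimal sketch: parse a Git LFS pointer stub of the form shown above
# (version / oid sha256:<hex> / size <bytes>).
def read_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, "r", encoding="utf-8") as handle:
        for line in handle:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return {"oid": fields["oid"], "size": int(fields["size"])}

# read_lfs_pointer("Experiments/nohup.out")
# -> {"oid": "sha256:9b67f0b1dbf0...", "size": 19940074}
```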
Experiments/run/events.out.tfevents.1706367627.edresson-train-80.45395.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa9dfc334b721e0bc90371cdf519b1bb244a637f4c56ce0ef949f76b5848ee8d
+size 347255243
Experiments/run/events.out.tfevents.1706367849.edresson-train-80.46052.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90283b4420a0282755e4425857d26cf58a98b7229057b9e6a5aca19014168184
+size 1238111
Experiments/run/events.out.tfevents.1706367954.edresson-train-80.46941.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4fb96459ffb6c55b7e51e5ee3a13a2f265f2c1579a78756e789583906320e81e
+size 350277161
Experiments/run/events.out.tfevents.1706446227.edresson-train-80.140666.0
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:15518ce7cf33a4ff76601734d638436b43895ab61e6b4efe25b5a24b495f529c
+size 21123264
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c62e29c7a1dd4f701ab4998e0b1f569cfe7486cc7806f149c1ff857f172383e0
+size 1043220702
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model_124752.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c62e29c7a1dd4f701ab4998e0b1f569cfe7486cc7806f149c1ff857f172383e0
+size 1043220702
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_130000.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a71ead47e605fc525b264ad882fd54630c15a42eb69aaf88993d26d5ea84ae3b
+size 1043220766
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_135000.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96e16ee83729813041c17f6edf8a702bdf59e7afe345cfad1fe65dd4ba0b1fce
+size 1043220766
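Note that best_model.pth and best_model_124752.pth above carry the same oid, so they resolve to one and the same LFS object (the best checkpoint was simply copied under a generic name). A small sketch, reusing the read_lfs_pointer helper from earlier, that groups pointer stubs by oid to surface such duplicates:

```python
# Sketch: group LFS pointer stubs by object id; any oid listed with more than
# one path (e.g. best_model.pth and best_model_124752.pth) is stored only once.
from collections import defaultdict

def duplicated_objects(pointer_paths):
    groups = defaultdict(list)
    for path in pointer_paths:
        groups[read_lfs_pointer(path)["oid"]].append(path)
    return {oid: paths for oid, paths in groups.items() if len(paths) > 1}
```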
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/config.json
ADDED
@@ -0,0 +1,496 @@
+{
+    "output_path": "/raid/datasets/MUPE/Experiments/runs",
+    "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
+    "run_name": "YourTTS-Baseline-PT",
+    "project_name": "SYNTACC",
+    "run_description": "\n - YourTTS with SYNTACC text encoder\n ",
+    "print_step": 50,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "clearml",
+    "save_on_interrupt": true,
+    "log_model_step": 1000,
+    "save_step": 5000,
+    "save_n_checkpoints": 2,
+    "save_checkpoints": true,
+    "save_all_best": false,
+    "save_best_after": 10000,
+    "target_loss": null,
+    "print_eval": false,
+    "test_delay_epochs": 0,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": false,
+    "precision": "fp16",
+    "epochs": 1000,
+    "batch_size": 26,
+    "eval_batch_size": 26,
+    "grad_clip": [
+        1000,
+        1000
+    ],
+    "scheduler_after_epoch": true,
+    "lr": 0.001,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": null,
+    "lr_scheduler_params": {},
+    "use_grad_scaler": false,
+    "allow_tf32": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "vits",
+    "num_loader_workers": 8,
+    "num_eval_loader_workers": 0,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "sample_rate": 16000,
+        "win_length": 1024,
+        "hop_length": 256,
+        "num_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null
+    },
+    "use_phonemes": false,
+    "phonemizer": "espeak",
+    "phoneme_language": "en",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": null,
+    "characters": {
+        "characters_class": "TTS.tts.models.vits.VitsCharacters",
+        "vocab_dict": null,
+        "pad": "_",
+        "eos": "&",
+        "bos": "*",
+        "blank": null,
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        "punctuations": "\u2014!'(),-.:;?\u00bf ",
+        "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        "is_unique": true,
+        "is_sorted": true
+    },
+    "add_blank": true,
+    "batch_group_size": 48,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": Infinity,
+    "compute_f0": false,
+    "compute_energy": false,
+    "compute_linear_spec": true,
+    "precompute_num_workers": 12,
+    "start_by_longest": true,
+    "shuffle": false,
+    "drop_last": false,
+    "datasets": [
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpb.csv",
+            "ignored_speakers": null,
+            "language": "brpb",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brba.csv",
+            "ignored_speakers": null,
+            "language": "brba",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brportugal.csv",
+            "ignored_speakers": null,
+            "language": "brportugal",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brsp.csv",
+            "ignored_speakers": null,
+            "language": "brsp",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpe.csv",
+            "ignored_speakers": null,
+            "language": "brpe",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brmg.csv",
+            "ignored_speakers": null,
+            "language": "brmg",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrj.csv",
+            "ignored_speakers": null,
+            "language": "brrj",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brce.csv",
+            "ignored_speakers": null,
+            "language": "brce",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrs.csv",
+            "ignored_speakers": null,
+            "language": "brrs",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bralemanha.csv",
+            "ignored_speakers": null,
+            "language": "bralemanha",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brgo.csv",
+            "ignored_speakers": null,
+            "language": "brgo",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bral.csv",
+            "ignored_speakers": null,
+            "language": "bral",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpr.csv",
+            "ignored_speakers": null,
+            "language": "brpr",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [
+        [
+            "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
+            "EDILEINE_FONSECA",
+            null,
+            "brsp"
+        ],
+        [
+            "Quem semeia ventos, colhe tempestades.",
+            "JOSE_PAULO_DE_ARAUJO",
+            null,
+            "brpb"
+        ],
+        [
+            "O olho do dono \u00e9 que engorda o gado.",
+            "VITOR_RAFAEL_OLIVEIRA_ALVES",
+            null,
+            "brba"
+        ],
+        [
+            "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
+            "MARIA_AURORA_FELIX",
+            null,
+            "brportugal"
+        ],
+        [
+            "Quem espera sempre alcan\u00e7a.",
+            "ANTONIO_DE_AMORIM_COSTA",
+            null,
+            "brpe"
+        ],
+        [
+            "Cada macaco no seu galho.",
+            "ALCIDES_DE_LIMA",
+            null,
+            "brmg"
+        ],
+        [
+            "Em terra de cego, quem tem um olho \u00e9 rei.",
+            "ALUISIO_SOARES_DE_SOUSA",
+            null,
+            "brrj"
+        ],
+        [
+            "A ocasi\u00e3o faz o ladr\u00e3o.",
+            "FRANCISCO_JOSE_MOREIRA_MOTA",
+            null,
+            "brce"
+        ],
+        [
+            "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
+            "EVALDO_ANDRADA_CORREA",
+            null,
+            "brrs"
+        ],
+        [
+            "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
+            "DORIS_ALEXANDER",
+            null,
+            "bralemanha"
+        ],
+        [
+            "Quem n\u00e3o arrisca, n\u00e3o petisca.",
+            "DONALDO_LUIZ_DE_ALMEIDA",
+            null,
+            "brgo"
+        ],
+        [
+            "A uni\u00e3o faz a for\u00e7a.",
+            "GERONCIO_HENRIQUE_NETO",
+            null,
+            "bral"
+        ],
+        [
+            "Em boca fechada n\u00e3o entra mosquito.",
+            "MALU_NATEL_FREIRE_WEBER",
+            null,
+            "brpr"
+        ]
+    ],
+    "eval_split_max_size": 256,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "model_args": {
+        "num_chars": 266,
+        "out_channels": 513,
+        "spec_segment_size": 62,
+        "hidden_channels": 192,
+        "use_adaptive_weight_text_encoder": false,
+        "use_perfect_class_batch_sampler": true,
+        "perfect_class_batch_sampler_key": "language",
+        "hidden_channels_ffn_text_encoder": 768,
+        "num_heads_text_encoder": 2,
+        "num_layers_text_encoder": 10,
+        "kernel_size_text_encoder": 3,
+        "dropout_p_text_encoder": 0.1,
+        "dropout_p_duration_predictor": 0.5,
+        "kernel_size_posterior_encoder": 5,
+        "dilation_rate_posterior_encoder": 1,
+        "num_layers_posterior_encoder": 16,
+        "kernel_size_flow": 5,
+        "dilation_rate_flow": 1,
+        "num_layers_flow": 4,
+        "resblock_type_decoder": "2",
+        "resblock_kernel_sizes_decoder": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes_decoder": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates_decoder": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel_decoder": 512,
+        "upsample_kernel_sizes_decoder": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "periods_multi_period_discriminator": [
+            2,
+            3,
+            5,
+            7,
+            11
+        ],
+        "use_sdp": true,
+        "noise_scale": 1.0,
+        "inference_noise_scale": 0.667,
+        "length_scale": 1,
+        "noise_scale_dp": 1.0,
+        "inference_noise_scale_dp": 1.0,
+        "max_inference_len": null,
+        "init_discriminator": true,
+        "use_spectral_norm_disriminator": false,
+        "use_speaker_embedding": false,
+        "num_speakers": 0,
+        "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth",
+        "d_vector_file": [
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
+        ],
+        "speaker_embedding_channels": 256,
+        "use_d_vector_file": true,
+        "d_vector_dim": 512,
+        "detach_dp_input": true,
+        "use_language_embedding": true,
+        "embedded_language_dim": 4,
+        "num_languages": 0,
+        "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json",
+        "use_speaker_encoder_as_loss": false,
+        "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+        "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+        "condition_dp_on_speaker": true,
+        "freeze_encoder": false,
+        "freeze_DP": false,
+        "freeze_PE": false,
+        "freeze_flow_decoder": false,
+        "freeze_waveform_decoder": false,
+        "encoder_sample_rate": null,
+        "interpolate_z": true,
+        "reinit_DP": false,
+        "reinit_text_encoder": false
+    },
+    "lr_gen": 0.0002,
+    "lr_disc": 0.0002,
+    "lr_scheduler_gen": "ExponentialLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "lr_scheduler_disc": "ExponentialLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "kl_loss_alpha": 1.0,
+    "disc_loss_alpha": 1.0,
+    "gen_loss_alpha": 1.0,
+    "feat_loss_alpha": 1.0,
+    "mel_loss_alpha": 45.0,
+    "dur_loss_alpha": 1.0,
+    "speaker_encoder_loss_alpha": 9.0,
+    "return_wav": true,
+    "use_weighted_sampler": true,
+    "weighted_sampler_attrs": {
+        "language": 1.0
+    },
+    "weighted_sampler_multipliers": {},
+    "r": 1,
+    "num_speakers": 0,
+    "use_speaker_embedding": false,
+    "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth",
+    "speaker_embedding_channels": 256,
+    "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json",
+    "use_language_embedding": true,
+    "use_d_vector_file": true,
+    "d_vector_file": [
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
+    ],
+    "d_vector_dim": 512
+}
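A small sketch (standard library only, paths as they appear in this diff) for sanity-checking the config added above; note that Python's json parser accepts the bare Infinity literals used for max_audio_len and max_text_len, so no special handling is needed:

```python
# Sketch: inspect the training config above with the standard library only.
import json
import os

CONFIG = "Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/config.json"

with open(CONFIG, "r", encoding="utf-8") as handle:
    cfg = json.load(handle)  # json.load handles the Infinity values

print(len(cfg["datasets"]), "accent datasets")       # 13
print(len(cfg["test_sentences"]), "test sentences")  # 13
# The d-vector files are absolute cluster paths; report any not reachable locally.
d_vectors = cfg["model_args"]["d_vector_file"]
missing = [p for p in d_vectors if not os.path.isfile(p)]
print(f"{len(missing)} of {len(d_vectors)} d-vector files not found locally")
```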
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json
ADDED
@@ -0,0 +1,15 @@
+{
+    "bral": 0,
+    "bralemanha": 1,
+    "brba": 2,
+    "brce": 3,
+    "brgo": 4,
+    "brmg": 5,
+    "brpb": 6,
+    "brpe": 7,
+    "brportugal": 8,
+    "brpr": 9,
+    "brrj": 10,
+    "brrs": 11,
+    "brsp": 12
+}
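The mapping above assigns an integer id to each of the 13 Portuguese accent codes; combined with "embedded_language_dim": 4 from the config, the language conditioning amounts to a 13-by-4 lookup table. A conceptual sketch in plain torch (not the actual Coqui-TTS internals, which wire this through the VITS model):

```python
# Conceptual sketch: what the language-id mapping feeds at train/inference time.
import json
import torch

LANG_IDS = "Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json"

with open(LANG_IDS, "r", encoding="utf-8") as handle:
    language_ids = json.load(handle)

# 13 accents x embedded_language_dim=4, per the config above.
language_embedding = torch.nn.Embedding(num_embeddings=len(language_ids), embedding_dim=4)
brsp_vector = language_embedding(torch.tensor([language_ids["brsp"]]))
print(brsp_vector.shape)  # torch.Size([1, 4])
```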
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
+size 3296
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/train_syntacc_baseline.py
ADDED
@@ -0,0 +1,352 @@
+import os
+
+import torch
+from trainer import Trainer, TrainerArgs
+
+from TTS.bin.compute_embeddings import compute_embeddings
+from TTS.bin.resample import resample_files
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.configs.vits_config import VitsConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
+from TTS.utils.downloaders import download_libri_tts
+from torch.utils.data import DataLoader
+from TTS.utils.samplers import PerfectBatchSampler
+torch.set_num_threads(24)
+
+# pylint: disable=W0105
+"""
+This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
+The YourTTS model is based on the VITS model, however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
+"""
+CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
+
+# Name of the run for the Trainer
+RUN_NAME = "YourTTS-Baseline-PT"
+
+# Path where you want to save the model outputs (configs, checkpoints and tensorboard logs)
+OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs")  # "/raid/coqui/Checkpoints/original-YourTTS/"
+
+# If you want to do transfer learning and speed up your training you can set here the path to the CML-TTS checkpoint that can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth"  # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+
+# This parameter is useful for debugging: it skips the training epochs and just runs the evaluation and produces the test sentences
+SKIP_TRAIN_EPOCH = False
+
+# Set here the batch size to be used in training and evaluation
+BATCH_SIZE = 26
+
+# Training sampling rate and the target sampling rate for resampling the downloaded dataset (Note: if you change this you might need to redownload the dataset!)
+# Note: if you add new datasets, please make sure that the dataset sampling rate and this parameter match, otherwise resample your audios
+SAMPLE_RATE = 16000
+
+
+DASHBOARD_LOGGER = "tensorboard"
+LOGGER_URI = None
+
+DASHBOARD_LOGGER = "clearml"
+LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
+
+
+
+# Max audio length in seconds to be used in training (every audio longer than this will be ignored)
+MAX_AUDIO_LEN_IN_SECONDS = float("inf")
+
+# Define here the datasets config
+brpb_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpb.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpb"
+)
+
+brba_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brba.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brba"
+)
+
+brportugal_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brportugal.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brportugal"
+)
+
+brsp_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brsp.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brsp"
+)
+
+brpe_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpe.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpe"
+)
+
+brmg_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brmg.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brmg"
+)
+
+brrj_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrj.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrj"
+)
+
+brce_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brce.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brce"
+)
+
+brrs_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrs.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrs"
+)
+
+bralemanha_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bralemanha.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bralemanha"
+)
+
+brgo_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brgo.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brgo"
+)
+
+bral_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bral.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bral"
+)
+
+brpr_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpr.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpr"
+)
+
+bres_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bres.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bres"
+)
+
+brpi_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpi.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpi"
+)
+
+# bres_train_config, brpi_train_config: no files found
+DATASETS_CONFIG_LIST = [brpb_train_config, brba_train_config, brportugal_train_config, brsp_train_config, brpe_train_config, brmg_train_config, brrj_train_config, brce_train_config, brrs_train_config, bralemanha_train_config, brgo_train_config, bral_train_config, brpr_train_config]
+
+
+### Extract speaker embeddings
+SPEAKER_ENCODER_CHECKPOINT_PATH = (
+    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
+)
+SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
+
+D_VECTOR_FILES = []  # List of speaker embeddings/d-vectors to be used during the training
+
+# Iterate over all the dataset configs, checking if the speaker embeddings are already computed; if not, compute them
+for dataset_conf in DATASETS_CONFIG_LIST:
+    # Check if the embeddings weren't already computed; if not, compute them
+    embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
+    if not os.path.isfile(embeddings_file):
+        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
+        compute_embeddings(
+            SPEAKER_ENCODER_CHECKPOINT_PATH,
+            SPEAKER_ENCODER_CONFIG_PATH,
+            embeddings_file,
+            old_speakers_file=None,
+            config_dataset_path=None,
+            formatter_name=dataset_conf.formatter,
+            dataset_name=dataset_conf.dataset_name,
+            dataset_path=dataset_conf.path,
+            meta_file_train=dataset_conf.meta_file_train,
+            meta_file_val=dataset_conf.meta_file_val,
+            disable_cuda=False,
+            no_eval=False,
+        )
+    D_VECTOR_FILES.append(embeddings_file)
+
+
+# Audio config used in training.
+audio_config = VitsAudioConfig(
+    sample_rate=SAMPLE_RATE,
+    hop_length=256,
+    win_length=1024,
+    fft_size=1024,
+    mel_fmin=0.0,
+    mel_fmax=None,
+    num_mels=80,
+)
+
+# Init VitsArgs setting the arguments that are needed for the YourTTS model
+model_args = VitsArgs(
+    spec_segment_size=62,
+    hidden_channels=192,
+    hidden_channels_ffn_text_encoder=768,
+    num_heads_text_encoder=2,
+    num_layers_text_encoder=10,
+    kernel_size_text_encoder=3,
+    dropout_p_text_encoder=0.1,
+    d_vector_file=D_VECTOR_FILES,
+    use_d_vector_file=True,
+    d_vector_dim=512,
+    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
+    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
+    resblock_type_decoder="2",  # In the paper, we accidentally trained YourTTS using ResNet blocks type 2; if you like you can use the ResNet blocks type 1 like the VITS model
+    # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
+    use_speaker_encoder_as_loss=False,
+    # Useful parameters to enable multilingual training
+    use_language_embedding=True,
+    embedded_language_dim=4,
+    use_adaptive_weight_text_encoder=False,
+    use_perfect_class_batch_sampler=True,
+    perfect_class_batch_sampler_key="language"
+)
+
+# General training config; here you can change the batch size and other useful parameters
+config = VitsConfig(
+    output_path=OUT_PATH,
+    model_args=model_args,
+    run_name=RUN_NAME,
+    project_name="SYNTACC",
+    run_description="""
+            - YourTTS with SYNTACC text encoder
+        """,
+    dashboard_logger=DASHBOARD_LOGGER,
+    logger_uri=LOGGER_URI,
+    audio=audio_config,
+    batch_size=BATCH_SIZE,
+    batch_group_size=48,
+    eval_batch_size=BATCH_SIZE,
+    num_loader_workers=8,
+    eval_split_max_size=256,
+    print_step=50,
+    plot_step=100,
+    log_model_step=1000,
+    save_step=5000,
+    save_n_checkpoints=2,
+    save_checkpoints=True,
+    # target_loss="loss_1",
+    print_eval=False,
+    use_phonemes=False,
+    phonemizer="espeak",
+    phoneme_language="en",
+    compute_input_seq_cache=True,
+    add_blank=True,
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class="TTS.tts.models.vits.VitsCharacters",
+        pad="_",
+        eos="&",
+        bos="*",
+        blank=None,
+        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        punctuations="\u2014!'(),-.:;?\u00bf ",
+        phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        is_unique=True,
+        is_sorted=True,
+    ),
+    phoneme_cache_path=None,
+    precompute_num_workers=12,
+    start_by_longest=True,
+    datasets=DATASETS_CONFIG_LIST,
+    cudnn_benchmark=False,
+    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
+    mixed_precision=False,
+    test_sentences=[
+        # GUSTAVO: only speakers seen in training
+        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
+        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
+        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
+        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
+        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
+        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
+        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
+        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
+        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
+        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
+        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
+        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
+        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
+        # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
+        # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
+    ],
+    # Enable the weighted sampler
+    use_weighted_sampler=True,
+    # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
+    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
+    weighted_sampler_attrs={"language": 1.0},
+    weighted_sampler_multipliers={
+        # "speaker_name": {
+        # You can force the batching scheme to give a higher weight to a certain speaker; that speaker will then appear more frequently in the batch.
+        # It will speed up the speaker adaptation process. Consider the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
+        # The line below will make the balancer consider "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers present in the CML dataset.
+        # 'new_speaker': 106, # (CML tot. train speakers)/4 = (424/4) = 106
+        # }
+    },
+    # It sets the Speaker Consistency Loss (SCL) α to 9 as in the YourTTS paper
+    speaker_encoder_loss_alpha=9.0,
+)
+
+# Load all the dataset samples and split training and evaluation sets
+train_samples, eval_samples = load_tts_samples(
+    config.datasets,
+    eval_split=True,
+    eval_split_max_size=config.eval_split_max_size,
+    eval_split_size=config.eval_split_size,
+)
+
+# Init the model
+model = Vits.init_from_config(config)
+
+# Init the trainer and 🚀
+trainer = Trainer(
+    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
+    config,
+    output_path=OUT_PATH,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+)
+trainer.fit()
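The embedding-extraction loop near the top of the script recomputes H_ASP_speaker_embeddings_<accent>.pth only when the file is missing, which is slow on a fresh machine. A small companion sketch (not part of the recipe) that simply reports which per-accent d-vector files would still have to be computed, without importing Coqui TTS:

```python
# Companion sketch: list the per-accent speaker-embedding files the recipe
# expects, mirroring the naming convention used in the training script.
import os

DATASET_PATH = "/raid/datasets/MUPE/dataset/mupe/"
ACCENTS = ["brpb", "brba", "brportugal", "brsp", "brpe", "brmg", "brrj",
           "brce", "brrs", "bralemanha", "brgo", "bral", "brpr"]

missing = [
    accent for accent in ACCENTS
    if not os.path.isfile(os.path.join(DATASET_PATH, f"H_ASP_speaker_embeddings_{accent}.pth"))
]
print("accents still needing speaker-embedding extraction:", missing)
```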
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/trainer_0_log.txt
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9eb020abfc0ef9798a6097596138d1567d58429ca6c2ce6e59b350acc5301cff
+size 1771305
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a4a050e0d7a9c6c302b70b3f59dc195b12ad8922988de81bae55cbc1a89b9c8
+size 347719275
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model_85001.pth
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2a4a050e0d7a9c6c302b70b3f59dc195b12ad8922988de81bae55cbc1a89b9c8
+size 347719275
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json
ADDED
@@ -0,0 +1,496 @@
+{
+    "output_path": "/raid/datasets/MUPE/Experiments/runs",
+    "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
+    "run_name": "YourTTS-Baseline-PT",
+    "project_name": "SYNTACC",
+    "run_description": "\n - YourTTS with SYNTACC text encoder\n ",
+    "print_step": 50,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "clearml",
+    "save_on_interrupt": true,
+    "log_model_step": 1000,
+    "save_step": 5000,
+    "save_n_checkpoints": 2,
+    "save_checkpoints": true,
+    "save_all_best": false,
+    "save_best_after": 10000,
+    "target_loss": null,
+    "print_eval": false,
+    "test_delay_epochs": 0,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": false,
+    "precision": "fp16",
+    "epochs": 1000,
+    "batch_size": 26,
+    "eval_batch_size": 26,
+    "grad_clip": [
+        1000,
+        1000
+    ],
+    "scheduler_after_epoch": true,
+    "lr": 0.001,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": null,
+    "lr_scheduler_params": {},
+    "use_grad_scaler": false,
+    "allow_tf32": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "vits",
+    "num_loader_workers": 8,
+    "num_eval_loader_workers": 0,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "sample_rate": 16000,
+        "win_length": 1024,
+        "hop_length": 256,
+        "num_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null
+    },
+    "use_phonemes": false,
+    "phonemizer": "espeak",
+    "phoneme_language": "en",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": null,
+    "characters": {
+        "characters_class": "TTS.tts.models.vits.VitsCharacters",
+        "vocab_dict": null,
+        "pad": "_",
+        "eos": "&",
+        "bos": "*",
+        "blank": null,
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        "punctuations": "\u2014!'(),-.:;?\u00bf ",
+        "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        "is_unique": true,
+        "is_sorted": true
+    },
+    "add_blank": true,
+    "batch_group_size": 48,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": Infinity,
+    "compute_f0": false,
+    "compute_energy": false,
+    "compute_linear_spec": true,
+    "precompute_num_workers": 12,
+    "start_by_longest": true,
+    "shuffle": false,
+    "drop_last": false,
+    "datasets": [
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpb.csv",
+            "ignored_speakers": null,
+            "language": "brpb",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brba.csv",
+            "ignored_speakers": null,
+            "language": "brba",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brportugal.csv",
+            "ignored_speakers": null,
+            "language": "brportugal",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brsp.csv",
+            "ignored_speakers": null,
+            "language": "brsp",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpe.csv",
+            "ignored_speakers": null,
+            "language": "brpe",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brmg.csv",
+            "ignored_speakers": null,
+            "language": "brmg",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrj.csv",
+            "ignored_speakers": null,
+            "language": "brrj",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brce.csv",
+            "ignored_speakers": null,
+            "language": "brce",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrs.csv",
+            "ignored_speakers": null,
+            "language": "brrs",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bralemanha.csv",
+            "ignored_speakers": null,
+            "language": "bralemanha",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brgo.csv",
|
218 |
+
"ignored_speakers": null,
|
219 |
+
"language": "brgo",
|
220 |
+
"phonemizer": "",
|
221 |
+
"meta_file_val": "",
|
222 |
+
"meta_file_attn_mask": ""
|
223 |
+
},
|
224 |
+
{
|
225 |
+
"formatter": "coqui",
|
226 |
+
"dataset_name": "mupe",
|
227 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
228 |
+
"meta_file_train": "metadata_coqui_bral.csv",
|
229 |
+
"ignored_speakers": null,
|
230 |
+
"language": "bral",
|
231 |
+
"phonemizer": "",
|
232 |
+
"meta_file_val": "",
|
233 |
+
"meta_file_attn_mask": ""
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"formatter": "coqui",
|
237 |
+
"dataset_name": "mupe",
|
238 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
239 |
+
"meta_file_train": "metadata_coqui_brpr.csv",
|
240 |
+
"ignored_speakers": null,
|
241 |
+
"language": "brpr",
|
242 |
+
"phonemizer": "",
|
243 |
+
"meta_file_val": "",
|
244 |
+
"meta_file_attn_mask": ""
|
245 |
+
}
|
246 |
+
],
|
247 |
+
"test_sentences": [
|
248 |
+
[
|
249 |
+
"Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
|
250 |
+
"EDILEINE_FONSECA",
|
251 |
+
null,
|
252 |
+
"brsp"
|
253 |
+
],
|
254 |
+
[
|
255 |
+
"Quem semeia ventos, colhe tempestades.",
|
256 |
+
"JOSE_PAULO_DE_ARAUJO",
|
257 |
+
null,
|
258 |
+
"brpb"
|
259 |
+
],
|
260 |
+
[
|
261 |
+
"O olho do dono \u00e9 que engorda o gado.",
|
262 |
+
"VITOR_RAFAEL_OLIVEIRA_ALVES",
|
263 |
+
null,
|
264 |
+
"brba"
|
265 |
+
],
|
266 |
+
[
|
267 |
+
"\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
|
268 |
+
"MARIA_AURORA_FELIX",
|
269 |
+
null,
|
270 |
+
"brportugal"
|
271 |
+
],
|
272 |
+
[
|
273 |
+
"Quem espera sempre alcan\u00e7a.",
|
274 |
+
"ANTONIO_DE_AMORIM_COSTA",
|
275 |
+
null,
|
276 |
+
"brpe"
|
277 |
+
],
|
278 |
+
[
|
279 |
+
"Cada macaco no seu galho.",
|
280 |
+
"ALCIDES_DE_LIMA",
|
281 |
+
null,
|
282 |
+
"brmg"
|
283 |
+
],
|
284 |
+
[
|
285 |
+
"Em terra de cego, quem tem um olho \u00e9 rei.",
|
286 |
+
"ALUISIO_SOARES_DE_SOUSA",
|
287 |
+
null,
|
288 |
+
"brrj"
|
289 |
+
],
|
290 |
+
[
|
291 |
+
"A ocasi\u00e3o faz o ladr\u00e3o.",
|
292 |
+
"FRANCISCO_JOSE_MOREIRA_MOTA",
|
293 |
+
null,
|
294 |
+
"brce"
|
295 |
+
],
|
296 |
+
[
|
297 |
+
"De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
|
298 |
+
"EVALDO_ANDRADA_CORREA",
|
299 |
+
null,
|
300 |
+
"brrs"
|
301 |
+
],
|
302 |
+
[
|
303 |
+
"Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
|
304 |
+
"DORIS_ALEXANDER",
|
305 |
+
null,
|
306 |
+
"bralemanha"
|
307 |
+
],
|
308 |
+
[
|
309 |
+
"Quem n\u00e3o arrisca, n\u00e3o petisca.",
|
310 |
+
"DONALDO_LUIZ_DE_ALMEIDA",
|
311 |
+
null,
|
312 |
+
"brgo"
|
313 |
+
],
|
314 |
+
[
|
315 |
+
"A uni\u00e3o faz a for\u00e7a.",
|
316 |
+
"GERONCIO_HENRIQUE_NETO",
|
317 |
+
null,
|
318 |
+
"bral"
|
319 |
+
],
|
320 |
+
[
|
321 |
+
"Em boca fechada n\u00e3o entra mosquito.",
|
322 |
+
"MALU_NATEL_FREIRE_WEBER",
|
323 |
+
null,
|
324 |
+
"brpr"
|
325 |
+
]
|
326 |
+
],
|
327 |
+
"eval_split_max_size": 256,
|
328 |
+
"eval_split_size": 0.01,
|
329 |
+
"use_speaker_weighted_sampler": false,
|
330 |
+
"speaker_weighted_sampler_alpha": 1.0,
|
331 |
+
"use_language_weighted_sampler": false,
|
332 |
+
"language_weighted_sampler_alpha": 1.0,
|
333 |
+
"use_length_weighted_sampler": false,
|
334 |
+
"length_weighted_sampler_alpha": 1.0,
|
335 |
+
"model_args": {
|
336 |
+
"num_chars": 266,
|
337 |
+
"out_channels": 513,
|
338 |
+
"spec_segment_size": 62,
|
339 |
+
"hidden_channels": 192,
|
340 |
+
"use_adaptive_weight_text_encoder": false,
|
341 |
+
"use_perfect_class_batch_sampler": true,
|
342 |
+
"perfect_class_batch_sampler_key": "language",
|
343 |
+
"hidden_channels_ffn_text_encoder": 768,
|
344 |
+
"num_heads_text_encoder": 2,
|
345 |
+
"num_layers_text_encoder": 10,
|
346 |
+
"kernel_size_text_encoder": 3,
|
347 |
+
"dropout_p_text_encoder": 0.1,
|
348 |
+
"dropout_p_duration_predictor": 0.5,
|
349 |
+
"kernel_size_posterior_encoder": 5,
|
350 |
+
"dilation_rate_posterior_encoder": 1,
|
351 |
+
"num_layers_posterior_encoder": 16,
|
352 |
+
"kernel_size_flow": 5,
|
353 |
+
"dilation_rate_flow": 1,
|
354 |
+
"num_layers_flow": 4,
|
355 |
+
"resblock_type_decoder": "2",
|
356 |
+
"resblock_kernel_sizes_decoder": [
|
357 |
+
3,
|
358 |
+
7,
|
359 |
+
11
|
360 |
+
],
|
361 |
+
"resblock_dilation_sizes_decoder": [
|
362 |
+
[
|
363 |
+
1,
|
364 |
+
3,
|
365 |
+
5
|
366 |
+
],
|
367 |
+
[
|
368 |
+
1,
|
369 |
+
3,
|
370 |
+
5
|
371 |
+
],
|
372 |
+
[
|
373 |
+
1,
|
374 |
+
3,
|
375 |
+
5
|
376 |
+
]
|
377 |
+
],
|
378 |
+
"upsample_rates_decoder": [
|
379 |
+
8,
|
380 |
+
8,
|
381 |
+
2,
|
382 |
+
2
|
383 |
+
],
|
384 |
+
"upsample_initial_channel_decoder": 512,
|
385 |
+
"upsample_kernel_sizes_decoder": [
|
386 |
+
16,
|
387 |
+
16,
|
388 |
+
4,
|
389 |
+
4
|
390 |
+
],
|
391 |
+
"periods_multi_period_discriminator": [
|
392 |
+
2,
|
393 |
+
3,
|
394 |
+
5,
|
395 |
+
7,
|
396 |
+
11
|
397 |
+
],
|
398 |
+
"use_sdp": true,
|
399 |
+
"noise_scale": 1.0,
|
400 |
+
"inference_noise_scale": 0.667,
|
401 |
+
"length_scale": 1,
|
402 |
+
"noise_scale_dp": 1.0,
|
403 |
+
"inference_noise_scale_dp": 1.0,
|
404 |
+
"max_inference_len": null,
|
405 |
+
"init_discriminator": true,
|
406 |
+
"use_spectral_norm_disriminator": false,
|
407 |
+
"use_speaker_embedding": false,
|
408 |
+
"num_speakers": 0,
|
409 |
+
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth",
|
410 |
+
"d_vector_file": [
|
411 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
412 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
413 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
|
414 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
|
415 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
|
416 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
|
417 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
|
418 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
|
419 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
|
420 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
|
421 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
|
422 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
|
423 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
|
424 |
+
],
|
425 |
+
"speaker_embedding_channels": 256,
|
426 |
+
"use_d_vector_file": true,
|
427 |
+
"d_vector_dim": 512,
|
428 |
+
"detach_dp_input": true,
|
429 |
+
"use_language_embedding": true,
|
430 |
+
"embedded_language_dim": 4,
|
431 |
+
"num_languages": 0,
|
432 |
+
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json",
|
433 |
+
"use_speaker_encoder_as_loss": false,
|
434 |
+
"speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
|
435 |
+
"speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
|
436 |
+
"condition_dp_on_speaker": true,
|
437 |
+
"freeze_encoder": false,
|
438 |
+
"freeze_DP": false,
|
439 |
+
"freeze_PE": false,
|
440 |
+
"freeze_flow_decoder": false,
|
441 |
+
"freeze_waveform_decoder": false,
|
442 |
+
"encoder_sample_rate": null,
|
443 |
+
"interpolate_z": true,
|
444 |
+
"reinit_DP": false,
|
445 |
+
"reinit_text_encoder": false
|
446 |
+
},
|
447 |
+
"lr_gen": 0.0002,
|
448 |
+
"lr_disc": 0.0002,
|
449 |
+
"lr_scheduler_gen": "ExponentialLR",
|
450 |
+
"lr_scheduler_gen_params": {
|
451 |
+
"gamma": 0.999875,
|
452 |
+
"last_epoch": -1
|
453 |
+
},
|
454 |
+
"lr_scheduler_disc": "ExponentialLR",
|
455 |
+
"lr_scheduler_disc_params": {
|
456 |
+
"gamma": 0.999875,
|
457 |
+
"last_epoch": -1
|
458 |
+
},
|
459 |
+
"kl_loss_alpha": 1.0,
|
460 |
+
"disc_loss_alpha": 1.0,
|
461 |
+
"gen_loss_alpha": 1.0,
|
462 |
+
"feat_loss_alpha": 1.0,
|
463 |
+
"mel_loss_alpha": 45.0,
|
464 |
+
"dur_loss_alpha": 1.0,
|
465 |
+
"speaker_encoder_loss_alpha": 9.0,
|
466 |
+
"return_wav": true,
|
467 |
+
"use_weighted_sampler": true,
|
468 |
+
"weighted_sampler_attrs": {
|
469 |
+
"language": 1.0
|
470 |
+
},
|
471 |
+
"weighted_sampler_multipliers": {},
|
472 |
+
"r": 1,
|
473 |
+
"num_speakers": 0,
|
474 |
+
"use_speaker_embedding": false,
|
475 |
+
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth",
|
476 |
+
"speaker_embedding_channels": 256,
|
477 |
+
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json",
|
478 |
+
"use_language_embedding": true,
|
479 |
+
"use_d_vector_file": true,
|
480 |
+
"d_vector_file": [
|
481 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
482 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
483 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
|
484 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
|
485 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
|
486 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
|
487 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
|
488 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
|
489 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
|
490 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
|
491 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
|
492 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
|
493 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
|
494 |
+
],
|
495 |
+
"d_vector_dim": 512
|
496 |
+
}
|
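The run configuration above is plain Coqpit-style JSON, so it can be inspected without loading the full training stack. A minimal sketch, assuming a local copy of this config.json (the hypothetical filename is the only thing not taken from the diff); note the non-standard `Infinity` literal, which Python's json module accepts by default:

```python
import json

# Hypothetical local path to the config shown above.
with open("config.json", "r", encoding="utf-8") as f:
    cfg = json.load(f)  # tolerates the Infinity literals used by max_audio_len / max_text_len

print(cfg["model"], cfg["batch_size"], cfg["lr_gen"])
print([d["language"] for d in cfg["datasets"]])   # the 13 MUPE accent subsets
print(len(cfg["model_args"]["d_vector_file"]))    # one d-vector file per subset
```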
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json
ADDED
@@ -0,0 +1,15 @@
{
    "bral": 0,
    "bralemanha": 1,
    "brba": 2,
    "brce": 3,
    "brgo": 4,
    "brmg": 5,
    "brpb": 6,
    "brpe": 7,
    "brportugal": 8,
    "brpr": 9,
    "brrj": 10,
    "brrs": 11,
    "brsp": 12
}
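language_ids.json maps each MUPE accent label to the integer id used by the language embedding; the ids follow the alphabetical order of the labels. A small sketch, assuming a local copy of the file:

```python
import json

with open("language_ids.json", "r", encoding="utf-8") as f:
    language_ids = json.load(f)

print(language_ids["brsp"])                          # 12
print(sorted(language_ids, key=language_ids.get))    # labels ordered by embedding id
```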
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
size 3296
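speakers.pth is committed as a Git LFS pointer (only the oid and size live in the repo), so the binary must be fetched with `git lfs pull` before it can be read. A minimal sketch, assuming the real file is present and that it is a torch-serialized speaker table, which is how Coqui TTS normally stores it:

```python
import torch

# Assumes `git lfs pull` has replaced the pointer with the actual ~3.2 KB file.
speakers = torch.load("speakers.pth", map_location="cpu")
print(type(speakers))
print(list(speakers)[:5])  # expected: speaker names from the MUPE metadata
```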
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/train_syntacc_baseline.py
ADDED
@@ -0,0 +1,352 @@
import os

import torch
from trainer import Trainer, TrainerArgs

from TTS.bin.compute_embeddings import compute_embeddings
from TTS.bin.resample import resample_files
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
from TTS.utils.downloaders import download_libri_tts
from torch.utils.data import DataLoader
from TTS.utils.samplers import PerfectBatchSampler

torch.set_num_threads(24)

# pylint: disable=W0105
"""
This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
The YourTTS model is based on the VITS model; however, it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
"""
CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))

# Name of the run for the Trainer
RUN_NAME = "YourTTS-Baseline-PT"

# Path where you want to save the model outputs (configs, checkpoints and tensorboard logs)
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs")  # "/raid/coqui/Checkpoints/original-YourTTS/"

# If you want to do transfer learning and speed up your training, you can set here the path to the CML-TTS checkpoint, which can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth"  # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p

# This parameter is useful for debugging: it skips the training epochs and just runs the evaluation and produces the test sentences
SKIP_TRAIN_EPOCH = False

# Set here the batch size to be used in training and evaluation
BATCH_SIZE = 26

# Training sampling rate and the target sampling rate for resampling the downloaded dataset (Note: if you change this you might need to redownload the dataset!!)
# Note: if you add new datasets, please make sure that the dataset sampling rate and this parameter match; otherwise resample your audios
SAMPLE_RATE = 16000

DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None

DASHBOARD_LOGGER = "clearml"
LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"

# Max audio length in seconds to be used in training (every audio longer than this will be ignored)
MAX_AUDIO_LEN_IN_SECONDS = float("inf")

# Define here the dataset configs (one per MUPE accent subset)
brpb_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brpb.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="brpb")
brba_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brba.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="brba")
brportugal_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brportugal.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="brportugal")
brsp_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brsp.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="brsp")
brpe_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brpe.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="brpe")
brmg_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brmg.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="brmg")
brrj_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brrj.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="brrj")
brce_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brce.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="brce")
brrs_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brrs.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="brrs")
bralemanha_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_bralemanha.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="bralemanha")
brgo_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brgo.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="brgo")
bral_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_bral.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="bral")
brpr_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brpr.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="brpr")
bres_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_bres.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="bres")
brpi_train_config = BaseDatasetConfig(formatter="coqui", dataset_name="mupe", meta_file_train="metadata_coqui_brpi.csv", path="/raid/datasets/MUPE/dataset/mupe/", language="brpi")

# bres_train_config and brpi_train_config are excluded: no files found
DATASETS_CONFIG_LIST = [
    brpb_train_config, brba_train_config, brportugal_train_config, brsp_train_config, brpe_train_config,
    brmg_train_config, brrj_train_config, brce_train_config, brrs_train_config, bralemanha_train_config,
    brgo_train_config, bral_train_config, brpr_train_config,
]

### Extract speaker embeddings
SPEAKER_ENCODER_CHECKPOINT_PATH = (
    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
)
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"

D_VECTOR_FILES = []  # List of speaker embeddings/d-vectors to be used during training

# Iterate over all the dataset configs, checking whether the speaker embeddings are already computed; if not, compute them
for dataset_conf in DATASETS_CONFIG_LIST:
    # Check whether the embeddings were already computed; if not, compute them
    embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
    if not os.path.isfile(embeddings_file):
        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
        compute_embeddings(
            SPEAKER_ENCODER_CHECKPOINT_PATH,
            SPEAKER_ENCODER_CONFIG_PATH,
            embeddings_file,
            old_speakers_file=None,
            config_dataset_path=None,
            formatter_name=dataset_conf.formatter,
            dataset_name=dataset_conf.dataset_name,
            dataset_path=dataset_conf.path,
            meta_file_train=dataset_conf.meta_file_train,
            meta_file_val=dataset_conf.meta_file_val,
            disable_cuda=False,
            no_eval=False,
        )
    D_VECTOR_FILES.append(embeddings_file)

# Audio config used in training.
audio_config = VitsAudioConfig(
    sample_rate=SAMPLE_RATE, hop_length=256, win_length=1024, fft_size=1024, mel_fmin=0.0, mel_fmax=None, num_mels=80
)

# Init VitsArgs, setting the arguments that are needed for the YourTTS model
model_args = VitsArgs(
    spec_segment_size=62,
    hidden_channels=192,
    hidden_channels_ffn_text_encoder=768,
    num_heads_text_encoder=2,
    num_layers_text_encoder=10,
    kernel_size_text_encoder=3,
    dropout_p_text_encoder=0.1,
    d_vector_file=D_VECTOR_FILES,
    use_d_vector_file=True,
    d_vector_dim=512,
    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
    resblock_type_decoder="2",  # In the paper, we accidentally trained YourTTS using ResNet blocks type 2; if you like, you can use ResNet blocks type 1 like the VITS model
    # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
    use_speaker_encoder_as_loss=False,
    # Useful parameters to enable multilingual training
    use_language_embedding=True,
    embedded_language_dim=4,
    use_adaptive_weight_text_encoder=False,
    use_perfect_class_batch_sampler=True,
    perfect_class_batch_sampler_key="language",
)

# General training config; here you can change the batch size and other useful parameters
config = VitsConfig(
    output_path=OUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name="SYNTACC",
    run_description="""
        - YourTTS with SYNTACC text encoder
    """,
    dashboard_logger=DASHBOARD_LOGGER,
    logger_uri=LOGGER_URI,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=48,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=8,
    eval_split_max_size=256,
    print_step=50,
    plot_step=100,
    log_model_step=1000,
    save_step=5000,
    save_n_checkpoints=2,
    save_checkpoints=True,
    # target_loss="loss_1",
    print_eval=False,
    use_phonemes=False,
    phonemizer="espeak",
    phoneme_language="en",
    compute_input_seq_cache=True,
    add_blank=True,
    text_cleaner="multilingual_cleaners",
    characters=CharactersConfig(
        characters_class="TTS.tts.models.vits.VitsCharacters",
        pad="_",
        eos="&",
        bos="*",
        blank=None,
        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
        punctuations="\u2014!'(),-.:;?\u00bf ",
        phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
        is_unique=True,
        is_sorted=True,
    ),
    phoneme_cache_path=None,
    precompute_num_workers=12,
    start_by_longest=True,
    datasets=DATASETS_CONFIG_LIST,
    cudnn_benchmark=False,
    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
    mixed_precision=False,
    test_sentences=[
        # GUSTAVO: only speakers from the training set
        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
        # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
        # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
    ],
    # Enable the weighted sampler
    use_weighted_sampler=True,
    # Ensures that all speakers are seen in the training batch equally, no matter how many samples each speaker has
    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
    weighted_sampler_attrs={"language": 1.0},
    weighted_sampler_multipliers={
        # "speaker_name": {
        #     You can force the batching scheme to give a higher weight to a certain speaker, and then that speaker will appear more frequently in the batch.
        #     This speeds up the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the name of the speaker that you want to adapt,
        #     the line below makes the balancer count "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers present in the CML dataset.
        #     'new_speaker': 106,  # (CML tot. train speakers)/4 = (424/4) = 106
        # }
    },
    # Set the Speaker Consistency Loss (SCL) alpha to 9, as in the YourTTS paper
    speaker_encoder_loss_alpha=9.0,
)

# Load all the dataset samples and split training and evaluation sets
train_samples, eval_samples = load_tts_samples(
    config.datasets,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# Init the model
model = Vits.init_from_config(config)

# Init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
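The recipe above precomputes one H_ASP_speaker_embeddings_<accent>.pth file per subset and passes the list as d_vector_file. A small pre-flight sketch, with the dataset path and accent list copied from the script (adjust both for your own setup), to confirm the d-vector files are in place before launching a run:

```python
import os

DATASET_PATH = "/raid/datasets/MUPE/dataset/mupe/"
ACCENTS = ["brpb", "brba", "brportugal", "brsp", "brpe", "brmg", "brrj",
           "brce", "brrs", "bralemanha", "brgo", "bral", "brpr"]

# Report any accent subset whose precomputed speaker-embedding file is missing.
missing = [
    acc for acc in ACCENTS
    if not os.path.isfile(os.path.join(DATASET_PATH, f"H_ASP_speaker_embeddings_{acc}.pth"))
]
print("missing d-vector files:", missing or "none")
```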
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/trainer_0_log.txt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:94c095ee47fd6e763ee0e129a7728cf80e5e4f21301e767ab0141c478d369b89
size 128993
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a082ddde12d21020f66a70cf05a74826488d10008a8379b699458d92509e85d1
size 1043216142
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model_87192.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a082ddde12d21020f66a70cf05a74826488d10008a8379b699458d92509e85d1
size 1043216142
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_130000.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5a584eb832a857f9a11180b34a84b81117d8690ed1e5fa39e4ff711cf6ffd7f7
size 1043220766
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_135000.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:367ac46477805942658a7a78e8cf473409537967f9382a46249a8d11521ed3f9
size 1043220766
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json
ADDED
@@ -0,0 +1,496 @@
1 |
+
{
|
2 |
+
"output_path": "/raid/datasets/MUPE/Experiments/runs",
|
3 |
+
"logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
|
4 |
+
"run_name": "YourTTS-Baseline-PT",
|
5 |
+
"project_name": "SYNTACC",
|
6 |
+
"run_description": "\n - YourTTS with SYNTACC text encoder\n ",
|
7 |
+
"print_step": 50,
|
8 |
+
"plot_step": 100,
|
9 |
+
"model_param_stats": false,
|
10 |
+
"wandb_entity": null,
|
11 |
+
"dashboard_logger": "clearml",
|
12 |
+
"save_on_interrupt": true,
|
13 |
+
"log_model_step": 1000,
|
14 |
+
"save_step": 5000,
|
15 |
+
"save_n_checkpoints": 2,
|
16 |
+
"save_checkpoints": true,
|
17 |
+
"save_all_best": false,
|
18 |
+
"save_best_after": 10000,
|
19 |
+
"target_loss": null,
|
20 |
+
"print_eval": false,
|
21 |
+
"test_delay_epochs": 0,
|
22 |
+
"run_eval": true,
|
23 |
+
"run_eval_steps": null,
|
24 |
+
"distributed_backend": "nccl",
|
25 |
+
"distributed_url": "tcp://localhost:54321",
|
26 |
+
"mixed_precision": false,
|
27 |
+
"precision": "fp16",
|
28 |
+
"epochs": 1000,
|
29 |
+
"batch_size": 26,
|
30 |
+
"eval_batch_size": 26,
|
31 |
+
"grad_clip": [
|
32 |
+
1000,
|
33 |
+
1000
|
34 |
+
],
|
35 |
+
"scheduler_after_epoch": true,
|
36 |
+
"lr": 0.001,
|
37 |
+
"optimizer": "AdamW",
|
38 |
+
"optimizer_params": {
|
39 |
+
"betas": [
|
40 |
+
0.8,
|
41 |
+
0.99
|
42 |
+
],
|
43 |
+
"eps": 1e-09,
|
44 |
+
"weight_decay": 0.01
|
45 |
+
},
|
46 |
+
"lr_scheduler": null,
|
47 |
+
"lr_scheduler_params": {},
|
48 |
+
"use_grad_scaler": false,
|
49 |
+
"allow_tf32": false,
|
50 |
+
"cudnn_enable": true,
|
51 |
+
"cudnn_deterministic": false,
|
52 |
+
"cudnn_benchmark": false,
|
53 |
+
"training_seed": 54321,
|
54 |
+
"model": "vits",
|
55 |
+
"num_loader_workers": 8,
|
56 |
+
"num_eval_loader_workers": 0,
|
57 |
+
"use_noise_augment": false,
|
58 |
+
"audio": {
|
59 |
+
"fft_size": 1024,
|
60 |
+
"sample_rate": 16000,
|
61 |
+
"win_length": 1024,
|
62 |
+
"hop_length": 256,
|
63 |
+
"num_mels": 80,
|
64 |
+
"mel_fmin": 0.0,
|
65 |
+
"mel_fmax": null
|
66 |
+
},
|
67 |
+
"use_phonemes": false,
|
68 |
+
"phonemizer": "espeak",
|
69 |
+
"phoneme_language": "en",
|
70 |
+
"compute_input_seq_cache": true,
|
71 |
+
"text_cleaner": "multilingual_cleaners",
|
72 |
+
"enable_eos_bos_chars": false,
|
73 |
+
"test_sentences_file": "",
|
74 |
+
"phoneme_cache_path": null,
|
75 |
+
"characters": {
|
76 |
+
"characters_class": "TTS.tts.models.vits.VitsCharacters",
|
77 |
+
"vocab_dict": null,
|
78 |
+
"pad": "_",
|
79 |
+
"eos": "&",
|
80 |
+
"bos": "*",
|
81 |
+
"blank": null,
|
82 |
+
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
|
83 |
+
"punctuations": "\u2014!'(),-.:;?\u00bf ",
|
84 |
+
"phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
|
85 |
+
"is_unique": true,
|
86 |
+
"is_sorted": true
|
87 |
+
},
|
88 |
+
"add_blank": true,
|
89 |
+
"batch_group_size": 48,
|
90 |
+
"loss_masking": null,
|
91 |
+
"min_audio_len": 1,
|
92 |
+
"max_audio_len": Infinity,
|
93 |
+
"min_text_len": 1,
|
94 |
+
"max_text_len": Infinity,
|
95 |
+
"compute_f0": false,
|
96 |
+
"compute_energy": false,
|
97 |
+
"compute_linear_spec": true,
|
98 |
+
"precompute_num_workers": 12,
|
99 |
+
"start_by_longest": true,
|
100 |
+
"shuffle": false,
|
101 |
+
"drop_last": false,
|
102 |
+
"datasets": [
|
103 |
+
{
|
104 |
+
"formatter": "coqui",
|
105 |
+
"dataset_name": "mupe",
|
106 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
107 |
+
"meta_file_train": "metadata_coqui_brpb.csv",
|
108 |
+
"ignored_speakers": null,
|
109 |
+
"language": "brpb",
|
110 |
+
"phonemizer": "",
|
111 |
+
"meta_file_val": "",
|
112 |
+
"meta_file_attn_mask": ""
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"formatter": "coqui",
|
116 |
+
"dataset_name": "mupe",
|
117 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
118 |
+
"meta_file_train": "metadata_coqui_brba.csv",
|
119 |
+
"ignored_speakers": null,
|
120 |
+
"language": "brba",
|
121 |
+
"phonemizer": "",
|
122 |
+
"meta_file_val": "",
|
123 |
+
"meta_file_attn_mask": ""
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"formatter": "coqui",
|
127 |
+
"dataset_name": "mupe",
|
128 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
129 |
+
"meta_file_train": "metadata_coqui_brportugal.csv",
|
130 |
+
"ignored_speakers": null,
|
131 |
+
"language": "brportugal",
|
132 |
+
"phonemizer": "",
|
133 |
+
"meta_file_val": "",
|
134 |
+
"meta_file_attn_mask": ""
|
135 |
+
},
|
136 |
+
{
|
137 |
+
"formatter": "coqui",
|
138 |
+
"dataset_name": "mupe",
|
139 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
140 |
+
"meta_file_train": "metadata_coqui_brsp.csv",
|
141 |
+
"ignored_speakers": null,
|
142 |
+
"language": "brsp",
|
143 |
+
"phonemizer": "",
|
144 |
+
"meta_file_val": "",
|
145 |
+
"meta_file_attn_mask": ""
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"formatter": "coqui",
|
149 |
+
"dataset_name": "mupe",
|
150 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
151 |
+
"meta_file_train": "metadata_coqui_brpe.csv",
|
152 |
+
"ignored_speakers": null,
|
153 |
+
"language": "brpe",
|
154 |
+
"phonemizer": "",
|
155 |
+
"meta_file_val": "",
|
156 |
+
"meta_file_attn_mask": ""
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"formatter": "coqui",
|
160 |
+
"dataset_name": "mupe",
|
161 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
162 |
+
"meta_file_train": "metadata_coqui_brmg.csv",
|
163 |
+
"ignored_speakers": null,
|
164 |
+
"language": "brmg",
|
165 |
+
"phonemizer": "",
|
166 |
+
"meta_file_val": "",
|
167 |
+
"meta_file_attn_mask": ""
|
168 |
+
},
|
169 |
+
{
|
170 |
+
"formatter": "coqui",
|
171 |
+
"dataset_name": "mupe",
|
172 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
173 |
+
"meta_file_train": "metadata_coqui_brrj.csv",
|
174 |
+
"ignored_speakers": null,
|
175 |
+
"language": "brrj",
|
176 |
+
"phonemizer": "",
|
177 |
+
"meta_file_val": "",
|
178 |
+
"meta_file_attn_mask": ""
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"formatter": "coqui",
|
182 |
+
"dataset_name": "mupe",
|
183 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
184 |
+
"meta_file_train": "metadata_coqui_brce.csv",
|
185 |
+
"ignored_speakers": null,
|
186 |
+
"language": "brce",
|
187 |
+
"phonemizer": "",
|
188 |
+
"meta_file_val": "",
|
189 |
+
"meta_file_attn_mask": ""
|
190 |
+
},
|
191 |
+
{
|
192 |
+
"formatter": "coqui",
|
193 |
+
"dataset_name": "mupe",
|
194 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
195 |
+
"meta_file_train": "metadata_coqui_brrs.csv",
|
196 |
+
"ignored_speakers": null,
|
197 |
+
"language": "brrs",
|
198 |
+
"phonemizer": "",
|
199 |
+
"meta_file_val": "",
|
200 |
+
"meta_file_attn_mask": ""
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"formatter": "coqui",
|
204 |
+
"dataset_name": "mupe",
|
205 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
206 |
+
"meta_file_train": "metadata_coqui_bralemanha.csv",
|
207 |
+
"ignored_speakers": null,
|
208 |
+
"language": "bralemanha",
|
209 |
+
"phonemizer": "",
|
210 |
+
"meta_file_val": "",
|
211 |
+
"meta_file_attn_mask": ""
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"formatter": "coqui",
|
215 |
+
"dataset_name": "mupe",
|
216 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
217 |
+
"meta_file_train": "metadata_coqui_brgo.csv",
|
218 |
+
"ignored_speakers": null,
|
219 |
+
"language": "brgo",
|
220 |
+
"phonemizer": "",
|
221 |
+
"meta_file_val": "",
|
222 |
+
"meta_file_attn_mask": ""
|
223 |
+
},
|
224 |
+
{
|
225 |
+
"formatter": "coqui",
|
226 |
+
"dataset_name": "mupe",
|
227 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
228 |
+
"meta_file_train": "metadata_coqui_bral.csv",
|
229 |
+
"ignored_speakers": null,
|
230 |
+
"language": "bral",
|
231 |
+
"phonemizer": "",
|
232 |
+
"meta_file_val": "",
|
233 |
+
"meta_file_attn_mask": ""
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"formatter": "coqui",
|
237 |
+
"dataset_name": "mupe",
|
238 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
239 |
+
"meta_file_train": "metadata_coqui_brpr.csv",
|
240 |
+
"ignored_speakers": null,
|
241 |
+
"language": "brpr",
|
242 |
+
"phonemizer": "",
|
243 |
+
"meta_file_val": "",
|
244 |
+
"meta_file_attn_mask": ""
|
245 |
+
}
|
246 |
+
],
|
247 |
+
"test_sentences": [
|
248 |
+
[
|
249 |
+
"Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
|
250 |
+
"EDILEINE_FONSECA",
|
251 |
+
null,
|
252 |
+
"brsp"
|
253 |
+
],
|
254 |
+
[
|
255 |
+
"Quem semeia ventos, colhe tempestades.",
|
256 |
+
"JOSE_PAULO_DE_ARAUJO",
|
257 |
+
null,
|
258 |
+
"brpb"
|
259 |
+
],
|
260 |
+
[
|
261 |
+
"O olho do dono \u00e9 que engorda o gado.",
|
262 |
+
"VITOR_RAFAEL_OLIVEIRA_ALVES",
|
263 |
+
null,
|
264 |
+
"brba"
|
265 |
+
],
|
266 |
+
[
|
267 |
+
"\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
|
268 |
+
"MARIA_AURORA_FELIX",
|
269 |
+
null,
|
270 |
+
"brportugal"
|
271 |
+
],
|
272 |
+
[
|
273 |
+
"Quem espera sempre alcan\u00e7a.",
|
274 |
+
"ANTONIO_DE_AMORIM_COSTA",
|
275 |
+
null,
|
276 |
+
"brpe"
|
277 |
+
],
|
278 |
+
[
|
279 |
+
"Cada macaco no seu galho.",
|
280 |
+
"ALCIDES_DE_LIMA",
|
281 |
+
null,
|
282 |
+
"brmg"
|
283 |
+
],
|
284 |
+
[
|
285 |
+
"Em terra de cego, quem tem um olho \u00e9 rei.",
|
286 |
+
"ALUISIO_SOARES_DE_SOUSA",
|
287 |
+
null,
|
288 |
+
"brrj"
|
289 |
+
],
|
290 |
+
[
|
291 |
+
"A ocasi\u00e3o faz o ladr\u00e3o.",
|
292 |
+
"FRANCISCO_JOSE_MOREIRA_MOTA",
|
293 |
+
null,
|
294 |
+
"brce"
|
295 |
+
],
|
296 |
+
[
|
297 |
+
"De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
|
298 |
+
"EVALDO_ANDRADA_CORREA",
|
299 |
+
null,
|
300 |
+
"brrs"
|
301 |
+
],
|
302 |
+
[
|
303 |
+
"Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
|
304 |
+
"DORIS_ALEXANDER",
|
305 |
+
null,
|
306 |
+
"bralemanha"
|
307 |
+
],
|
308 |
+
[
|
309 |
+
"Quem n\u00e3o arrisca, n\u00e3o petisca.",
|
310 |
+
"DONALDO_LUIZ_DE_ALMEIDA",
|
311 |
+
null,
|
312 |
+
"brgo"
|
313 |
+
],
|
314 |
+
[
|
315 |
+
"A uni\u00e3o faz a for\u00e7a.",
|
316 |
+
"GERONCIO_HENRIQUE_NETO",
|
317 |
+
null,
|
318 |
+
"bral"
|
319 |
+
],
|
320 |
+
[
|
321 |
+
"Em boca fechada n\u00e3o entra mosquito.",
|
322 |
+
"MALU_NATEL_FREIRE_WEBER",
|
323 |
+
null,
|
324 |
+
"brpr"
|
325 |
+
]
|
326 |
+
],
|
327 |
+
"eval_split_max_size": 256,
|
328 |
+
"eval_split_size": 0.01,
|
329 |
+
"use_speaker_weighted_sampler": false,
|
330 |
+
"speaker_weighted_sampler_alpha": 1.0,
|
331 |
+
"use_language_weighted_sampler": false,
|
332 |
+
"language_weighted_sampler_alpha": 1.0,
|
333 |
+
"use_length_weighted_sampler": false,
|
334 |
+
"length_weighted_sampler_alpha": 1.0,
|
335 |
+
"model_args": {
|
336 |
+
"num_chars": 266,
|
337 |
+
"out_channels": 513,
|
338 |
+
"spec_segment_size": 62,
|
339 |
+
"hidden_channels": 192,
|
340 |
+
"use_adaptive_weight_text_encoder": false,
|
341 |
+
"use_perfect_class_batch_sampler": true,
|
342 |
+
"perfect_class_batch_sampler_key": "language",
|
343 |
+
"hidden_channels_ffn_text_encoder": 768,
|
344 |
+
"num_heads_text_encoder": 2,
|
345 |
+
"num_layers_text_encoder": 10,
|
346 |
+
"kernel_size_text_encoder": 3,
|
347 |
+
"dropout_p_text_encoder": 0.1,
|
348 |
+
"dropout_p_duration_predictor": 0.5,
|
349 |
+
"kernel_size_posterior_encoder": 5,
|
350 |
+
"dilation_rate_posterior_encoder": 1,
|
351 |
+
"num_layers_posterior_encoder": 16,
|
352 |
+
"kernel_size_flow": 5,
|
353 |
+
"dilation_rate_flow": 1,
|
354 |
+
"num_layers_flow": 4,
|
355 |
+
"resblock_type_decoder": "2",
|
356 |
+
"resblock_kernel_sizes_decoder": [
|
357 |
+
3,
|
358 |
+
7,
|
359 |
+
11
|
360 |
+
],
|
361 |
+
"resblock_dilation_sizes_decoder": [
|
362 |
+
[
|
363 |
+
1,
|
364 |
+
3,
|
365 |
+
5
|
366 |
+
],
|
367 |
+
[
|
368 |
+
1,
|
369 |
+
3,
|
370 |
+
5
|
371 |
+
],
|
372 |
+
[
|
373 |
+
1,
|
374 |
+
3,
|
375 |
+
5
|
376 |
+
]
|
377 |
+
],
|
378 |
+
"upsample_rates_decoder": [
|
379 |
+
8,
|
380 |
+
8,
|
381 |
+
2,
|
382 |
+
2
|
383 |
+
],
|
384 |
+
"upsample_initial_channel_decoder": 512,
|
385 |
+
"upsample_kernel_sizes_decoder": [
|
386 |
+
16,
|
387 |
+
16,
|
388 |
+
4,
|
389 |
+
4
|
390 |
+
],
|
391 |
+
"periods_multi_period_discriminator": [
|
392 |
+
2,
|
393 |
+
3,
|
394 |
+
5,
|
395 |
+
7,
|
396 |
+
11
|
397 |
+
],
|
398 |
+
"use_sdp": true,
|
399 |
+
"noise_scale": 1.0,
|
400 |
+
"inference_noise_scale": 0.667,
|
401 |
+
"length_scale": 1,
|
402 |
+
"noise_scale_dp": 1.0,
|
403 |
+
"inference_noise_scale_dp": 1.0,
|
404 |
+
"max_inference_len": null,
|
405 |
+
"init_discriminator": true,
|
406 |
+
"use_spectral_norm_disriminator": false,
|
407 |
+
"use_speaker_embedding": false,
|
408 |
+
"num_speakers": 0,
|
409 |
+
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth",
|
410 |
+
"d_vector_file": [
|
411 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
412 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
413 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
|
414 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
|
415 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
|
416 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
|
417 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
|
418 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
|
419 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
|
420 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
|
421 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
|
422 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
|
423 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
|
424 |
+
],
|
425 |
+
"speaker_embedding_channels": 256,
|
426 |
+
"use_d_vector_file": true,
|
427 |
+
"d_vector_dim": 512,
|
428 |
+
"detach_dp_input": true,
|
429 |
+
"use_language_embedding": true,
|
430 |
+
"embedded_language_dim": 4,
|
431 |
+
"num_languages": 0,
|
432 |
+
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json",
|
433 |
+
"use_speaker_encoder_as_loss": false,
|
434 |
+
"speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
|
435 |
+
"speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
|
436 |
+
"condition_dp_on_speaker": true,
|
437 |
+
"freeze_encoder": false,
|
438 |
+
"freeze_DP": false,
|
439 |
+
"freeze_PE": false,
|
440 |
+
"freeze_flow_decoder": false,
|
441 |
+
"freeze_waveform_decoder": false,
|
442 |
+
"encoder_sample_rate": null,
|
443 |
+
"interpolate_z": true,
|
444 |
+
"reinit_DP": false,
|
445 |
+
"reinit_text_encoder": false
|
446 |
+
},
|
447 |
+
"lr_gen": 0.0002,
|
448 |
+
"lr_disc": 0.0002,
|
449 |
+
"lr_scheduler_gen": "ExponentialLR",
|
450 |
+
"lr_scheduler_gen_params": {
|
451 |
+
"gamma": 0.999875,
|
452 |
+
"last_epoch": -1
|
453 |
+
},
|
454 |
+
"lr_scheduler_disc": "ExponentialLR",
|
455 |
+
"lr_scheduler_disc_params": {
|
456 |
+
"gamma": 0.999875,
|
457 |
+
"last_epoch": -1
|
458 |
+
},
|
459 |
+
"kl_loss_alpha": 1.0,
|
460 |
+
"disc_loss_alpha": 1.0,
|
461 |
+
"gen_loss_alpha": 1.0,
|
462 |
+
"feat_loss_alpha": 1.0,
|
463 |
+
"mel_loss_alpha": 45.0,
|
464 |
+
"dur_loss_alpha": 1.0,
|
465 |
+
"speaker_encoder_loss_alpha": 9.0,
|
466 |
+
"return_wav": true,
|
467 |
+
"use_weighted_sampler": true,
|
468 |
+
"weighted_sampler_attrs": {
|
469 |
+
"language": 1.0
|
470 |
+
},
|
471 |
+
"weighted_sampler_multipliers": {},
|
472 |
+
"r": 1,
|
473 |
+
"num_speakers": 0,
|
474 |
+
"use_speaker_embedding": false,
|
475 |
+
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth",
|
476 |
+
"speaker_embedding_channels": 256,
|
477 |
+
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json",
|
478 |
+
"use_language_embedding": true,
|
479 |
+
"use_d_vector_file": true,
|
480 |
+
"d_vector_file": [
|
481 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
482 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
483 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
|
484 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
|
485 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
|
486 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
|
487 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
|
488 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
|
489 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
|
490 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
|
491 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
|
492 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
|
493 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
|
494 |
+
],
|
495 |
+
"d_vector_dim": 512
|
496 |
+
}
|
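This second config.json belongs to the 12+05PM-165973116 run and, apart from the run-specific speakers_file and language_ids_file paths, appears to match the 12+04PM-2bc0892f9 config above line for line. A quick sketch to check that, assuming local copies of both run folders (the relative paths below are assumptions):

```python
import json

def flatten(d, prefix=""):
    """Flatten nested dicts into dotted keys so leaf values can be compared."""
    out = {}
    for key, value in d.items():
        name = f"{prefix}{key}"
        if isinstance(value, dict):
            out.update(flatten(value, name + "."))
        else:
            out[name] = value
    return out

paths = [
    "YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json",
    "YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json",
]
configs = []
for p in paths:
    with open(p, "r", encoding="utf-8") as f:
        configs.append(flatten(json.load(f)))

a, b = configs
# Print every dotted key whose value differs between the two runs.
print(sorted(k for k in a.keys() | b.keys() if a.get(k) != b.get(k)))
```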
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json
ADDED
@@ -0,0 +1,15 @@
{
    "bral": 0,
    "bralemanha": 1,
    "brba": 2,
    "brce": 3,
    "brgo": 4,
    "brmg": 5,
    "brpb": 6,
    "brpe": 7,
    "brportugal": 8,
    "brpr": 9,
    "brrj": 10,
    "brrs": 11,
    "brsp": 12
}
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
size 3296
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/train_syntacc_baseline.py
ADDED
@@ -0,0 +1,352 @@
1 |
+
import os
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from trainer import Trainer, TrainerArgs
|
5 |
+
|
6 |
+
from TTS.bin.compute_embeddings import compute_embeddings
|
7 |
+
from TTS.bin.resample import resample_files
|
8 |
+
from TTS.config.shared_configs import BaseDatasetConfig
|
9 |
+
from TTS.tts.configs.vits_config import VitsConfig
|
10 |
+
from TTS.tts.datasets import load_tts_samples
|
11 |
+
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
|
12 |
+
from TTS.utils.downloaders import download_libri_tts
|
13 |
+
from torch.utils.data import DataLoader
|
14 |
+
from TTS.utils.samplers import PerfectBatchSampler
|
15 |
+
torch.set_num_threads(24)
|
16 |
+
|
17 |
+
# pylint: disable=W0105
|
18 |
+
"""
|
19 |
+
This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
|
20 |
+
YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
|
21 |
+
"""
|
22 |
+
CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
|
23 |
+
|
24 |
+
# Name of the run for the Trainer
|
25 |
+
RUN_NAME = "YourTTS-Baseline-PT"
|
26 |
+
|
27 |
+
# Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
|
28 |
+
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
|
29 |
+
|
30 |
+
# If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
31 |
+
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
32 |
+
|
33 |
+
# This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences
|
34 |
+
SKIP_TRAIN_EPOCH = False
|
35 |
+
|
36 |
+
# Set here the batch size to be used in training and evaluation
|
37 |
+
BATCH_SIZE = 26
|
38 |
+
|
39 |
+
# Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
|
40 |
+
# Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
|
41 |
+
SAMPLE_RATE = 16000
|
42 |
+
|
43 |
+
|
44 |
+
DASHBOARD_LOGGER="tensorboard"
|
45 |
+
LOGGER_URI = None
|
46 |
+
|
47 |
+
DASHBOARD_LOGGER = "clearml"
|
48 |
+
LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
|
49 |
+
|
50 |
+
|
51 |
+
|
52 |
+
# Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
|
53 |
+
MAX_AUDIO_LEN_IN_SECONDS = float("inf")
|
54 |
+
|
55 |
+
# Define here the datasets config
|
56 |
+
brpb_train_config = BaseDatasetConfig(
|
57 |
+
formatter="coqui",
|
58 |
+
dataset_name="mupe",
|
59 |
+
meta_file_train="metadata_coqui_brpb.csv",
|
60 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
61 |
+
language="brpb"
|
62 |
+
)
|
63 |
+
|
64 |
+
brba_train_config = BaseDatasetConfig(
|
65 |
+
formatter="coqui",
|
66 |
+
dataset_name="mupe",
|
67 |
+
meta_file_train="metadata_coqui_brba.csv",
|
68 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
69 |
+
language="brba"
|
70 |
+
)
|
71 |
+
|
72 |
+
brportugal_train_config = BaseDatasetConfig(
|
73 |
+
formatter="coqui",
|
74 |
+
dataset_name="mupe",
|
75 |
+
meta_file_train="metadata_coqui_brportugal.csv",
|
76 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
77 |
+
language="brportugal"
|
78 |
+
)
|
79 |
+
|
80 |
+
brsp_train_config = BaseDatasetConfig(
|
81 |
+
formatter="coqui",
|
82 |
+
dataset_name="mupe",
|
83 |
+
meta_file_train="metadata_coqui_brsp.csv",
|
84 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
85 |
+
language="brsp"
|
86 |
+
)
|
87 |
+
|
88 |
+
brpe_train_config = BaseDatasetConfig(
|
89 |
+
formatter="coqui",
|
90 |
+
dataset_name="mupe",
|
91 |
+
meta_file_train="metadata_coqui_brpe.csv",
|
92 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
93 |
+
language="brpe"
|
94 |
+
)
|
95 |
+
|
96 |
+
brmg_train_config = BaseDatasetConfig(
|
97 |
+
formatter="coqui",
|
98 |
+
dataset_name="mupe",
|
99 |
+
meta_file_train="metadata_coqui_brmg.csv",
|
100 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
101 |
+
language="brmg"
|
102 |
+
)
|
103 |
+
|
104 |
+
brrj_train_config = BaseDatasetConfig(
|
105 |
+
formatter="coqui",
|
106 |
+
dataset_name="mupe",
|
107 |
+
meta_file_train="metadata_coqui_brrj.csv",
|
108 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
109 |
+
language="brrj"
|
110 |
+
)
|
111 |
+
|
112 |
+
brce_train_config = BaseDatasetConfig(
|
113 |
+
formatter="coqui",
|
114 |
+
dataset_name="mupe",
|
115 |
+
meta_file_train="metadata_coqui_brce.csv",
|
116 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
117 |
+
language="brce"
|
118 |
+
)
|
119 |
+
|
120 |
+
brrs_train_config = BaseDatasetConfig(
|
121 |
+
formatter="coqui",
|
122 |
+
dataset_name="mupe",
|
123 |
+
meta_file_train="metadata_coqui_brrs.csv",
|
124 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
125 |
+
language="brrs"
|
126 |
+
)
|
127 |
+
|
128 |
+
bralemanha_train_config = BaseDatasetConfig(
|
129 |
+
formatter="coqui",
|
130 |
+
dataset_name="mupe",
|
131 |
+
meta_file_train="metadata_coqui_bralemanha.csv",
|
132 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
133 |
+
language="bralemanha"
|
134 |
+
)
|
135 |
+
|
136 |
+
brgo_train_config = BaseDatasetConfig(
|
137 |
+
formatter="coqui",
|
138 |
+
dataset_name="mupe",
|
139 |
+
meta_file_train="metadata_coqui_brgo.csv",
|
140 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
141 |
+
language="brgo"
|
142 |
+
)
|
143 |
+
|
144 |
+
bral_train_config = BaseDatasetConfig(
|
145 |
+
formatter="coqui",
|
146 |
+
dataset_name="mupe",
|
147 |
+
meta_file_train="metadata_coqui_bral.csv",
|
148 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
149 |
+
language="bral"
|
150 |
+
)
|
151 |
+
|
152 |
+
brpr_train_config = BaseDatasetConfig(
|
153 |
+
formatter="coqui",
|
154 |
+
dataset_name="mupe",
|
155 |
+
meta_file_train="metadata_coqui_brpr.csv",
|
156 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
157 |
+
language="brpr"
|
158 |
+
)
|
159 |
+
|
160 |
+
bres_train_config = BaseDatasetConfig(
|
161 |
+
formatter="coqui",
|
162 |
+
dataset_name="mupe",
|
163 |
+
meta_file_train="metadata_coqui_bres.csv",
|
164 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
165 |
+
language="bres"
|
166 |
+
)
|
167 |
+
|
168 |
+
brpi_train_config = BaseDatasetConfig(
|
169 |
+
formatter="coqui",
|
170 |
+
dataset_name="mupe",
|
171 |
+
meta_file_train="metadata_coqui_brpi.csv",
|
172 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
173 |
+
language="brpi"
|
174 |
+
)
|
175 |
+
|
176 |
+
# bres_train_config, brpi_train_config no files found
|
177 |
+
DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
|
178 |
+
|
179 |
+
|
180 |
+
### Extract speaker embeddings
|
181 |
+
SPEAKER_ENCODER_CHECKPOINT_PATH = (
|
182 |
+
"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
|
183 |
+
)
|
184 |
+
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
|
185 |
+
|
186 |
+
D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
|
187 |
+
|
188 |
+
# Iterates all the dataset configs checking if the speakers embeddings are already computated, if not compute it
|
189 |
+
for dataset_conf in DATASETS_CONFIG_LIST:
|
190 |
+
# Check if the embeddings weren't already computed, if not compute it
|
191 |
+
embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
|
192 |
+
if not os.path.isfile(embeddings_file):
|
193 |
+
print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
|
194 |
+
compute_embeddings(
|
195 |
+
SPEAKER_ENCODER_CHECKPOINT_PATH,
|
196 |
+
SPEAKER_ENCODER_CONFIG_PATH,
|
197 |
+
embeddings_file,
|
198 |
+
old_speakers_file=None,
|
199 |
+
config_dataset_path=None,
|
200 |
+
formatter_name=dataset_conf.formatter,
|
201 |
+
dataset_name=dataset_conf.dataset_name,
|
202 |
+
dataset_path=dataset_conf.path,
|
203 |
+
meta_file_train=dataset_conf.meta_file_train,
|
204 |
+
meta_file_val=dataset_conf.meta_file_val,
|
205 |
+
disable_cuda=False,
|
206 |
+
no_eval=False,
|
207 |
+
)
|
208 |
+
D_VECTOR_FILES.append(embeddings_file)
|
209 |
+
|
210 |
+
|
211 |
+
# Audio config used in training.
|
212 |
+
audio_config = VitsAudioConfig(
|
213 |
+
sample_rate=SAMPLE_RATE,
|
214 |
+
hop_length=256,
|
215 |
+
win_length=1024,
|
216 |
+
fft_size=1024,
|
217 |
+
mel_fmin=0.0,
|
218 |
+
mel_fmax=None,
|
219 |
+
num_mels=80,
|
220 |
+
)
|
221 |
+
|
222 |
+
# Init VITSArgs setting the arguments that are needed for the YourTTS model
|
223 |
+
model_args = VitsArgs(
|
224 |
+
spec_segment_size=62,
|
225 |
+
hidden_channels=192,
|
226 |
+
hidden_channels_ffn_text_encoder=768,
|
227 |
+
num_heads_text_encoder=2,
|
228 |
+
num_layers_text_encoder=10,
|
229 |
+
kernel_size_text_encoder=3,
|
230 |
+
dropout_p_text_encoder=0.1,
|
231 |
+
d_vector_file=D_VECTOR_FILES,
|
232 |
+
use_d_vector_file=True,
|
233 |
+
d_vector_dim=512,
|
234 |
+
speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
|
235 |
+
speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
|
236 |
+
resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
|
237 |
+
# Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
|
238 |
+
use_speaker_encoder_as_loss=False,
|
239 |
+
# Useful parameters to enable multilingual training
|
240 |
+
use_language_embedding=True,
|
241 |
+
embedded_language_dim=4,
|
242 |
+
use_adaptive_weight_text_encoder=False,
|
243 |
+
use_perfect_class_batch_sampler=True,
|
244 |
+
perfect_class_batch_sampler_key="language"
|
245 |
+
)
|
246 |
+
|
247 |
+
# General training config, here you can change the batch size and others useful parameters
|
248 |
+
config = VitsConfig(
|
249 |
+
output_path=OUT_PATH,
|
250 |
+
model_args=model_args,
|
251 |
+
run_name=RUN_NAME,
|
252 |
+
project_name="SYNTACC",
|
253 |
+
run_description="""
|
254 |
+
- YourTTS with SYNTACC text encoder
|
255 |
+
""",
|
256 |
+
dashboard_logger=DASHBOARD_LOGGER,
|
257 |
+
logger_uri=LOGGER_URI,
|
258 |
+
audio=audio_config,
|
259 |
+
batch_size=BATCH_SIZE,
|
260 |
+
batch_group_size=48,
|
261 |
+
eval_batch_size=BATCH_SIZE,
|
262 |
+
num_loader_workers=8,
|
263 |
+
eval_split_max_size=256,
|
264 |
+
print_step=50,
|
265 |
+
plot_step=100,
|
266 |
+
log_model_step=1000,
|
267 |
+
save_step=5000,
|
268 |
+
save_n_checkpoints=2,
|
269 |
+
save_checkpoints=True,
|
270 |
+
# target_loss="loss_1",
|
271 |
+
print_eval=False,
|
272 |
+
use_phonemes=False,
|
273 |
+
phonemizer="espeak",
|
274 |
+
phoneme_language="en",
|
275 |
+
compute_input_seq_cache=True,
|
276 |
+
add_blank=True,
|
277 |
+
text_cleaner="multilingual_cleaners",
|
278 |
+
characters=CharactersConfig(
|
279 |
+
characters_class="TTS.tts.models.vits.VitsCharacters",
|
280 |
+
pad="_",
|
281 |
+
eos="&",
|
282 |
+
bos="*",
|
283 |
+
blank=None,
|
284 |
+
characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
|
285 |
+
punctuations="\u2014!'(),-.:;?\u00bf ",
|
286 |
+
phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
|
287 |
+
is_unique=True,
|
288 |
+
is_sorted=True,
|
289 |
+
),
|
290 |
+
phoneme_cache_path=None,
|
291 |
+
precompute_num_workers=12,
|
292 |
+
start_by_longest=True,
|
293 |
+
datasets=DATASETS_CONFIG_LIST,
|
294 |
+
cudnn_benchmark=False,
|
295 |
+
max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
|
296 |
+
mixed_precision=False,
|
297 |
+
test_sentences=[
|
298 |
+
#GUSTAVO: apenas pessoas do treino
|
299 |
+
["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
|
300 |
+
["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
|
301 |
+
["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
|
302 |
+
["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
|
303 |
+
["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
|
304 |
+
["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
|
305 |
+
["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
|
306 |
+
["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
|
307 |
+
["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
|
308 |
+
["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
|
309 |
+
["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
|
310 |
+
["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
|
311 |
+
["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
|
312 |
+
# ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
|
313 |
+
# ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
|
314 |
+
],
|
315 |
+
# Enable the weighted sampler
|
316 |
+
use_weighted_sampler=True,
|
317 |
+
# Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
|
318 |
+
# weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
|
319 |
+
weighted_sampler_attrs={"language": 1.0},
|
320 |
+
weighted_sampler_multipliers={
|
321 |
+
# "speaker_name": {
|
322 |
+
# you can force the batching scheme to give a higher weight to a certain speaker and then this speaker will appears more frequently on the batch.
|
323 |
+
# It will speedup the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
|
324 |
+
# The line above will make the balancer consider the "new_speaker" as 106 speakers so 1/4 of the number of speakers present on CML dataset.
|
325 |
+
# 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
|
326 |
+
# }
|
327 |
+
},
|
328 |
+
# It defines the Speaker Consistency Loss (SCL) α to 9 like the YourTTS paper
|
329 |
+
speaker_encoder_loss_alpha=9.0,
|
330 |
+
)
|
331 |
+
|
332 |
+
# Load all the datasets samples and split traning and evaluation sets
|
333 |
+
train_samples, eval_samples = load_tts_samples(
|
334 |
+
config.datasets,
|
335 |
+
eval_split=True,
|
336 |
+
eval_split_max_size=config.eval_split_max_size,
|
337 |
+
eval_split_size=config.eval_split_size,
|
338 |
+
)
|
339 |
+
|
340 |
+
# Init the model
|
341 |
+
model = Vits.init_from_config(config)
|
342 |
+
|
343 |
+
# Init the trainer and 🚀
|
344 |
+
trainer = Trainer(
|
345 |
+
TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
|
346 |
+
config,
|
347 |
+
output_path=OUT_PATH,
|
348 |
+
model=model,
|
349 |
+
train_samples=train_samples,
|
350 |
+
eval_samples=eval_samples,
|
351 |
+
)
|
352 |
+
trainer.fit()
|
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/trainer_0_log.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5ddf81cb4061c7e47bd824c3ebb109cc02bc31ab79ee21e4e69d60d32aca454b
|
3 |
+
size 1794644
|
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/best_model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2cb1d72efa1724f811028b33a003492d486385a35846b2a09aae34ece757cbab
|
3 |
+
size 1044057134
|
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/best_model_78415.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2cb1d72efa1724f811028b33a003492d486385a35846b2a09aae34ece757cbab
|
3 |
+
size 1044057134
|
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_80000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5208e907be9e2db12b928d9c2b1abd4df0b757f34703f124db1a326449a882f2
|
3 |
+
size 1044057198
|
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f77b5c81d12b629d83cce93a9b0318eb1d41888e6e985706fa275841c92444d3
|
3 |
+
size 1044057198
|
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/config.json
ADDED
@@ -0,0 +1,496 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"output_path": "/raid/datasets/MUPE/Experiments/runs",
|
3 |
+
"logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
|
4 |
+
"run_name": "YourTTS-Syntacc-PT",
|
5 |
+
"project_name": "SYNTACC",
|
6 |
+
"run_description": "\n - YourTTS with SYNTACC text encoder\n ",
|
7 |
+
"print_step": 50,
|
8 |
+
"plot_step": 100,
|
9 |
+
"model_param_stats": false,
|
10 |
+
"wandb_entity": null,
|
11 |
+
"dashboard_logger": "clearml",
|
12 |
+
"save_on_interrupt": true,
|
13 |
+
"log_model_step": 1000,
|
14 |
+
"save_step": 5000,
|
15 |
+
"save_n_checkpoints": 2,
|
16 |
+
"save_checkpoints": true,
|
17 |
+
"save_all_best": false,
|
18 |
+
"save_best_after": 10000,
|
19 |
+
"target_loss": null,
|
20 |
+
"print_eval": false,
|
21 |
+
"test_delay_epochs": 0,
|
22 |
+
"run_eval": true,
|
23 |
+
"run_eval_steps": null,
|
24 |
+
"distributed_backend": "nccl",
|
25 |
+
"distributed_url": "tcp://localhost:54321",
|
26 |
+
"mixed_precision": false,
|
27 |
+
"precision": "fp16",
|
28 |
+
"epochs": 1000,
|
29 |
+
"batch_size": 26,
|
30 |
+
"eval_batch_size": 26,
|
31 |
+
"grad_clip": [
|
32 |
+
1000,
|
33 |
+
1000
|
34 |
+
],
|
35 |
+
"scheduler_after_epoch": true,
|
36 |
+
"lr": 0.001,
|
37 |
+
"optimizer": "AdamW",
|
38 |
+
"optimizer_params": {
|
39 |
+
"betas": [
|
40 |
+
0.8,
|
41 |
+
0.99
|
42 |
+
],
|
43 |
+
"eps": 1e-09,
|
44 |
+
"weight_decay": 0.01
|
45 |
+
},
|
46 |
+
"lr_scheduler": null,
|
47 |
+
"lr_scheduler_params": {},
|
48 |
+
"use_grad_scaler": false,
|
49 |
+
"allow_tf32": false,
|
50 |
+
"cudnn_enable": true,
|
51 |
+
"cudnn_deterministic": false,
|
52 |
+
"cudnn_benchmark": false,
|
53 |
+
"training_seed": 54321,
|
54 |
+
"model": "vits",
|
55 |
+
"num_loader_workers": 8,
|
56 |
+
"num_eval_loader_workers": 0,
|
57 |
+
"use_noise_augment": false,
|
58 |
+
"audio": {
|
59 |
+
"fft_size": 1024,
|
60 |
+
"sample_rate": 16000,
|
61 |
+
"win_length": 1024,
|
62 |
+
"hop_length": 256,
|
63 |
+
"num_mels": 80,
|
64 |
+
"mel_fmin": 0.0,
|
65 |
+
"mel_fmax": null
|
66 |
+
},
|
67 |
+
"use_phonemes": false,
|
68 |
+
"phonemizer": "espeak",
|
69 |
+
"phoneme_language": "en",
|
70 |
+
"compute_input_seq_cache": true,
|
71 |
+
"text_cleaner": "multilingual_cleaners",
|
72 |
+
"enable_eos_bos_chars": false,
|
73 |
+
"test_sentences_file": "",
|
74 |
+
"phoneme_cache_path": null,
|
75 |
+
"characters": {
|
76 |
+
"characters_class": "TTS.tts.models.vits.VitsCharacters",
|
77 |
+
"vocab_dict": null,
|
78 |
+
"pad": "_",
|
79 |
+
"eos": "&",
|
80 |
+
"bos": "*",
|
81 |
+
"blank": null,
|
82 |
+
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
|
83 |
+
"punctuations": "\u2014!'(),-.:;?\u00bf ",
|
84 |
+
"phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
|
85 |
+
"is_unique": true,
|
86 |
+
"is_sorted": true
|
87 |
+
},
|
88 |
+
"add_blank": true,
|
89 |
+
"batch_group_size": 48,
|
90 |
+
"loss_masking": null,
|
91 |
+
"min_audio_len": 1,
|
92 |
+
"max_audio_len": Infinity,
|
93 |
+
"min_text_len": 1,
|
94 |
+
"max_text_len": Infinity,
|
95 |
+
"compute_f0": false,
|
96 |
+
"compute_energy": false,
|
97 |
+
"compute_linear_spec": true,
|
98 |
+
"precompute_num_workers": 12,
|
99 |
+
"start_by_longest": true,
|
100 |
+
"shuffle": false,
|
101 |
+
"drop_last": false,
|
102 |
+
"datasets": [
|
103 |
+
{
|
104 |
+
"formatter": "coqui",
|
105 |
+
"dataset_name": "mupe",
|
106 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
107 |
+
"meta_file_train": "metadata_coqui_brpb.csv",
|
108 |
+
"ignored_speakers": null,
|
109 |
+
"language": "brpb",
|
110 |
+
"phonemizer": "",
|
111 |
+
"meta_file_val": "",
|
112 |
+
"meta_file_attn_mask": ""
|
113 |
+
},
|
114 |
+
{
|
115 |
+
"formatter": "coqui",
|
116 |
+
"dataset_name": "mupe",
|
117 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
118 |
+
"meta_file_train": "metadata_coqui_brba.csv",
|
119 |
+
"ignored_speakers": null,
|
120 |
+
"language": "brba",
|
121 |
+
"phonemizer": "",
|
122 |
+
"meta_file_val": "",
|
123 |
+
"meta_file_attn_mask": ""
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"formatter": "coqui",
|
127 |
+
"dataset_name": "mupe",
|
128 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
129 |
+
"meta_file_train": "metadata_coqui_brportugal.csv",
|
130 |
+
"ignored_speakers": null,
|
131 |
+
"language": "brportugal",
|
132 |
+
"phonemizer": "",
|
133 |
+
"meta_file_val": "",
|
134 |
+
"meta_file_attn_mask": ""
|
135 |
+
},
|
136 |
+
{
|
137 |
+
"formatter": "coqui",
|
138 |
+
"dataset_name": "mupe",
|
139 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
140 |
+
"meta_file_train": "metadata_coqui_brsp.csv",
|
141 |
+
"ignored_speakers": null,
|
142 |
+
"language": "brsp",
|
143 |
+
"phonemizer": "",
|
144 |
+
"meta_file_val": "",
|
145 |
+
"meta_file_attn_mask": ""
|
146 |
+
},
|
147 |
+
{
|
148 |
+
"formatter": "coqui",
|
149 |
+
"dataset_name": "mupe",
|
150 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
151 |
+
"meta_file_train": "metadata_coqui_brpe.csv",
|
152 |
+
"ignored_speakers": null,
|
153 |
+
"language": "brpe",
|
154 |
+
"phonemizer": "",
|
155 |
+
"meta_file_val": "",
|
156 |
+
"meta_file_attn_mask": ""
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"formatter": "coqui",
|
160 |
+
"dataset_name": "mupe",
|
161 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
162 |
+
"meta_file_train": "metadata_coqui_brmg.csv",
|
163 |
+
"ignored_speakers": null,
|
164 |
+
"language": "brmg",
|
165 |
+
"phonemizer": "",
|
166 |
+
"meta_file_val": "",
|
167 |
+
"meta_file_attn_mask": ""
|
168 |
+
},
|
169 |
+
{
|
170 |
+
"formatter": "coqui",
|
171 |
+
"dataset_name": "mupe",
|
172 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
173 |
+
"meta_file_train": "metadata_coqui_brrj.csv",
|
174 |
+
"ignored_speakers": null,
|
175 |
+
"language": "brrj",
|
176 |
+
"phonemizer": "",
|
177 |
+
"meta_file_val": "",
|
178 |
+
"meta_file_attn_mask": ""
|
179 |
+
},
|
180 |
+
{
|
181 |
+
"formatter": "coqui",
|
182 |
+
"dataset_name": "mupe",
|
183 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
184 |
+
"meta_file_train": "metadata_coqui_brce.csv",
|
185 |
+
"ignored_speakers": null,
|
186 |
+
"language": "brce",
|
187 |
+
"phonemizer": "",
|
188 |
+
"meta_file_val": "",
|
189 |
+
"meta_file_attn_mask": ""
|
190 |
+
},
|
191 |
+
{
|
192 |
+
"formatter": "coqui",
|
193 |
+
"dataset_name": "mupe",
|
194 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
195 |
+
"meta_file_train": "metadata_coqui_brrs.csv",
|
196 |
+
"ignored_speakers": null,
|
197 |
+
"language": "brrs",
|
198 |
+
"phonemizer": "",
|
199 |
+
"meta_file_val": "",
|
200 |
+
"meta_file_attn_mask": ""
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"formatter": "coqui",
|
204 |
+
"dataset_name": "mupe",
|
205 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
206 |
+
"meta_file_train": "metadata_coqui_bralemanha.csv",
|
207 |
+
"ignored_speakers": null,
|
208 |
+
"language": "bralemanha",
|
209 |
+
"phonemizer": "",
|
210 |
+
"meta_file_val": "",
|
211 |
+
"meta_file_attn_mask": ""
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"formatter": "coqui",
|
215 |
+
"dataset_name": "mupe",
|
216 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
217 |
+
"meta_file_train": "metadata_coqui_brgo.csv",
|
218 |
+
"ignored_speakers": null,
|
219 |
+
"language": "brgo",
|
220 |
+
"phonemizer": "",
|
221 |
+
"meta_file_val": "",
|
222 |
+
"meta_file_attn_mask": ""
|
223 |
+
},
|
224 |
+
{
|
225 |
+
"formatter": "coqui",
|
226 |
+
"dataset_name": "mupe",
|
227 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
228 |
+
"meta_file_train": "metadata_coqui_bral.csv",
|
229 |
+
"ignored_speakers": null,
|
230 |
+
"language": "bral",
|
231 |
+
"phonemizer": "",
|
232 |
+
"meta_file_val": "",
|
233 |
+
"meta_file_attn_mask": ""
|
234 |
+
},
|
235 |
+
{
|
236 |
+
"formatter": "coqui",
|
237 |
+
"dataset_name": "mupe",
|
238 |
+
"path": "/raid/datasets/MUPE/dataset/mupe/",
|
239 |
+
"meta_file_train": "metadata_coqui_brpr.csv",
|
240 |
+
"ignored_speakers": null,
|
241 |
+
"language": "brpr",
|
242 |
+
"phonemizer": "",
|
243 |
+
"meta_file_val": "",
|
244 |
+
"meta_file_attn_mask": ""
|
245 |
+
}
|
246 |
+
],
|
247 |
+
"test_sentences": [
|
248 |
+
[
|
249 |
+
"Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
|
250 |
+
"EDILEINE_FONSECA",
|
251 |
+
null,
|
252 |
+
"brsp"
|
253 |
+
],
|
254 |
+
[
|
255 |
+
"Quem semeia ventos, colhe tempestades.",
|
256 |
+
"JOSE_PAULO_DE_ARAUJO",
|
257 |
+
null,
|
258 |
+
"brpb"
|
259 |
+
],
|
260 |
+
[
|
261 |
+
"O olho do dono \u00e9 que engorda o gado.",
|
262 |
+
"VITOR_RAFAEL_OLIVEIRA_ALVES",
|
263 |
+
null,
|
264 |
+
"brba"
|
265 |
+
],
|
266 |
+
[
|
267 |
+
"\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
|
268 |
+
"MARIA_AURORA_FELIX",
|
269 |
+
null,
|
270 |
+
"brportugal"
|
271 |
+
],
|
272 |
+
[
|
273 |
+
"Quem espera sempre alcan\u00e7a.",
|
274 |
+
"ANTONIO_DE_AMORIM_COSTA",
|
275 |
+
null,
|
276 |
+
"brpe"
|
277 |
+
],
|
278 |
+
[
|
279 |
+
"Cada macaco no seu galho.",
|
280 |
+
"ALCIDES_DE_LIMA",
|
281 |
+
null,
|
282 |
+
"brmg"
|
283 |
+
],
|
284 |
+
[
|
285 |
+
"Em terra de cego, quem tem um olho \u00e9 rei.",
|
286 |
+
"ALUISIO_SOARES_DE_SOUSA",
|
287 |
+
null,
|
288 |
+
"brrj"
|
289 |
+
],
|
290 |
+
[
|
291 |
+
"A ocasi\u00e3o faz o ladr\u00e3o.",
|
292 |
+
"FRANCISCO_JOSE_MOREIRA_MOTA",
|
293 |
+
null,
|
294 |
+
"brce"
|
295 |
+
],
|
296 |
+
[
|
297 |
+
"De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
|
298 |
+
"EVALDO_ANDRADA_CORREA",
|
299 |
+
null,
|
300 |
+
"brrs"
|
301 |
+
],
|
302 |
+
[
|
303 |
+
"Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
|
304 |
+
"DORIS_ALEXANDER",
|
305 |
+
null,
|
306 |
+
"bralemanha"
|
307 |
+
],
|
308 |
+
[
|
309 |
+
"Quem n\u00e3o arrisca, n\u00e3o petisca.",
|
310 |
+
"DONALDO_LUIZ_DE_ALMEIDA",
|
311 |
+
null,
|
312 |
+
"brgo"
|
313 |
+
],
|
314 |
+
[
|
315 |
+
"A uni\u00e3o faz a for\u00e7a.",
|
316 |
+
"GERONCIO_HENRIQUE_NETO",
|
317 |
+
null,
|
318 |
+
"bral"
|
319 |
+
],
|
320 |
+
[
|
321 |
+
"Em boca fechada n\u00e3o entra mosquito.",
|
322 |
+
"MALU_NATEL_FREIRE_WEBER",
|
323 |
+
null,
|
324 |
+
"brpr"
|
325 |
+
]
|
326 |
+
],
|
327 |
+
"eval_split_max_size": 256,
|
328 |
+
"eval_split_size": 0.01,
|
329 |
+
"use_speaker_weighted_sampler": false,
|
330 |
+
"speaker_weighted_sampler_alpha": 1.0,
|
331 |
+
"use_language_weighted_sampler": false,
|
332 |
+
"language_weighted_sampler_alpha": 1.0,
|
333 |
+
"use_length_weighted_sampler": false,
|
334 |
+
"length_weighted_sampler_alpha": 1.0,
|
335 |
+
"model_args": {
|
336 |
+
"num_chars": 266,
|
337 |
+
"out_channels": 513,
|
338 |
+
"spec_segment_size": 62,
|
339 |
+
"hidden_channels": 192,
|
340 |
+
"use_adaptive_weight_text_encoder": true,
|
341 |
+
"use_perfect_class_batch_sampler": true,
|
342 |
+
"perfect_class_batch_sampler_key": "language",
|
343 |
+
"hidden_channels_ffn_text_encoder": 768,
|
344 |
+
"num_heads_text_encoder": 2,
|
345 |
+
"num_layers_text_encoder": 10,
|
346 |
+
"kernel_size_text_encoder": 3,
|
347 |
+
"dropout_p_text_encoder": 0.1,
|
348 |
+
"dropout_p_duration_predictor": 0.5,
|
349 |
+
"kernel_size_posterior_encoder": 5,
|
350 |
+
"dilation_rate_posterior_encoder": 1,
|
351 |
+
"num_layers_posterior_encoder": 16,
|
352 |
+
"kernel_size_flow": 5,
|
353 |
+
"dilation_rate_flow": 1,
|
354 |
+
"num_layers_flow": 4,
|
355 |
+
"resblock_type_decoder": "2",
|
356 |
+
"resblock_kernel_sizes_decoder": [
|
357 |
+
3,
|
358 |
+
7,
|
359 |
+
11
|
360 |
+
],
|
361 |
+
"resblock_dilation_sizes_decoder": [
|
362 |
+
[
|
363 |
+
1,
|
364 |
+
3,
|
365 |
+
5
|
366 |
+
],
|
367 |
+
[
|
368 |
+
1,
|
369 |
+
3,
|
370 |
+
5
|
371 |
+
],
|
372 |
+
[
|
373 |
+
1,
|
374 |
+
3,
|
375 |
+
5
|
376 |
+
]
|
377 |
+
],
|
378 |
+
"upsample_rates_decoder": [
|
379 |
+
8,
|
380 |
+
8,
|
381 |
+
2,
|
382 |
+
2
|
383 |
+
],
|
384 |
+
"upsample_initial_channel_decoder": 512,
|
385 |
+
"upsample_kernel_sizes_decoder": [
|
386 |
+
16,
|
387 |
+
16,
|
388 |
+
4,
|
389 |
+
4
|
390 |
+
],
|
391 |
+
"periods_multi_period_discriminator": [
|
392 |
+
2,
|
393 |
+
3,
|
394 |
+
5,
|
395 |
+
7,
|
396 |
+
11
|
397 |
+
],
|
398 |
+
"use_sdp": true,
|
399 |
+
"noise_scale": 1.0,
|
400 |
+
"inference_noise_scale": 0.667,
|
401 |
+
"length_scale": 1,
|
402 |
+
"noise_scale_dp": 1.0,
|
403 |
+
"inference_noise_scale_dp": 1.0,
|
404 |
+
"max_inference_len": null,
|
405 |
+
"init_discriminator": true,
|
406 |
+
"use_spectral_norm_disriminator": false,
|
407 |
+
"use_speaker_embedding": false,
|
408 |
+
"num_speakers": 0,
|
409 |
+
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/speakers.pth",
|
410 |
+
"d_vector_file": [
|
411 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
412 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
413 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
|
414 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
|
415 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
|
416 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
|
417 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
|
418 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
|
419 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
|
420 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
|
421 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
|
422 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
|
423 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
|
424 |
+
],
|
425 |
+
"speaker_embedding_channels": 256,
|
426 |
+
"use_d_vector_file": true,
|
427 |
+
"d_vector_dim": 512,
|
428 |
+
"detach_dp_input": true,
|
429 |
+
"use_language_embedding": false,
|
430 |
+
"embedded_language_dim": 4,
|
431 |
+
"num_languages": 0,
|
432 |
+
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/language_ids.json",
|
433 |
+
"use_speaker_encoder_as_loss": false,
|
434 |
+
"speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
|
435 |
+
"speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
|
436 |
+
"condition_dp_on_speaker": true,
|
437 |
+
"freeze_encoder": false,
|
438 |
+
"freeze_DP": false,
|
439 |
+
"freeze_PE": false,
|
440 |
+
"freeze_flow_decoder": false,
|
441 |
+
"freeze_waveform_decoder": false,
|
442 |
+
"encoder_sample_rate": null,
|
443 |
+
"interpolate_z": true,
|
444 |
+
"reinit_DP": false,
|
445 |
+
"reinit_text_encoder": false
|
446 |
+
},
|
447 |
+
"lr_gen": 0.0002,
|
448 |
+
"lr_disc": 0.0002,
|
449 |
+
"lr_scheduler_gen": "ExponentialLR",
|
450 |
+
"lr_scheduler_gen_params": {
|
451 |
+
"gamma": 0.999875,
|
452 |
+
"last_epoch": -1
|
453 |
+
},
|
454 |
+
"lr_scheduler_disc": "ExponentialLR",
|
455 |
+
"lr_scheduler_disc_params": {
|
456 |
+
"gamma": 0.999875,
|
457 |
+
"last_epoch": -1
|
458 |
+
},
|
459 |
+
"kl_loss_alpha": 1.0,
|
460 |
+
"disc_loss_alpha": 1.0,
|
461 |
+
"gen_loss_alpha": 1.0,
|
462 |
+
"feat_loss_alpha": 1.0,
|
463 |
+
"mel_loss_alpha": 45.0,
|
464 |
+
"dur_loss_alpha": 1.0,
|
465 |
+
"speaker_encoder_loss_alpha": 9.0,
|
466 |
+
"return_wav": true,
|
467 |
+
"use_weighted_sampler": true,
|
468 |
+
"weighted_sampler_attrs": {
|
469 |
+
"language": 1.0
|
470 |
+
},
|
471 |
+
"weighted_sampler_multipliers": {},
|
472 |
+
"r": 1,
|
473 |
+
"num_speakers": 0,
|
474 |
+
"use_speaker_embedding": false,
|
475 |
+
"speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/speakers.pth",
|
476 |
+
"speaker_embedding_channels": 256,
|
477 |
+
"language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/language_ids.json",
|
478 |
+
"use_language_embedding": false,
|
479 |
+
"use_d_vector_file": true,
|
480 |
+
"d_vector_file": [
|
481 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
|
482 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
|
483 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
|
484 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
|
485 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
|
486 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
|
487 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
|
488 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
|
489 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
|
490 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
|
491 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
|
492 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
|
493 |
+
"/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
|
494 |
+
],
|
495 |
+
"d_vector_dim": 512
|
496 |
+
}
|
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/language_ids.json
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bral": 0,
|
3 |
+
"bralemanha": 1,
|
4 |
+
"brba": 2,
|
5 |
+
"brce": 3,
|
6 |
+
"brgo": 4,
|
7 |
+
"brmg": 5,
|
8 |
+
"brpb": 6,
|
9 |
+
"brpe": 7,
|
10 |
+
"brportugal": 8,
|
11 |
+
"brpr": 9,
|
12 |
+
"brrj": 10,
|
13 |
+
"brrs": 11,
|
14 |
+
"brsp": 12
|
15 |
+
}
|
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/speakers.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
|
3 |
+
size 3296
|
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/train_syntacc.py
ADDED
@@ -0,0 +1,352 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from trainer import Trainer, TrainerArgs
|
5 |
+
|
6 |
+
from TTS.bin.compute_embeddings import compute_embeddings
|
7 |
+
from TTS.bin.resample import resample_files
|
8 |
+
from TTS.config.shared_configs import BaseDatasetConfig
|
9 |
+
from TTS.tts.configs.vits_config import VitsConfig
|
10 |
+
from TTS.tts.datasets import load_tts_samples
|
11 |
+
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
|
12 |
+
from TTS.utils.downloaders import download_libri_tts
|
13 |
+
from torch.utils.data import DataLoader
|
14 |
+
from TTS.utils.samplers import PerfectBatchSampler
|
15 |
+
torch.set_num_threads(24)
|
16 |
+
|
17 |
+
# pylint: disable=W0105
|
18 |
+
"""
|
19 |
+
This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
|
20 |
+
YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
|
21 |
+
"""
|
22 |
+
CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
|
23 |
+
|
24 |
+
# Name of the run for the Trainer
|
25 |
+
RUN_NAME = "YourTTS-Syntacc-PT"
|
26 |
+
|
27 |
+
# Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
|
28 |
+
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
|
29 |
+
|
30 |
+
# If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
31 |
+
RESTORE_PATH = "/raid/edresson/dev/Paper/cml_tts/checkpoints_yourtts_cml_tts_dataset/best_model.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
|
32 |
+
|
33 |
+
# This paramter is useful to debug, it skips the training epochs and just do the evaluation and produce the test sentences
|
34 |
+
SKIP_TRAIN_EPOCH = False
|
35 |
+
|
36 |
+
# Set here the batch size to be used in training and evaluation
|
37 |
+
BATCH_SIZE = 26
|
38 |
+
|
39 |
+
# Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
|
40 |
+
# Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
|
41 |
+
SAMPLE_RATE = 16000
|
42 |
+
|
43 |
+
|
44 |
+
DASHBOARD_LOGGER="tensorboard"
|
45 |
+
LOGGER_URI = None
|
46 |
+
|
47 |
+
DASHBOARD_LOGGER = "clearml"
|
48 |
+
LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
|
49 |
+
|
50 |
+
|
51 |
+
|
52 |
+
# Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
|
53 |
+
MAX_AUDIO_LEN_IN_SECONDS = float("inf")
|
54 |
+
|
55 |
+
# Define here the datasets config
|
56 |
+
brpb_train_config = BaseDatasetConfig(
|
57 |
+
formatter="coqui",
|
58 |
+
dataset_name="mupe",
|
59 |
+
meta_file_train="metadata_coqui_brpb.csv",
|
60 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
61 |
+
language="brpb"
|
62 |
+
)
|
63 |
+
|
64 |
+
brba_train_config = BaseDatasetConfig(
|
65 |
+
formatter="coqui",
|
66 |
+
dataset_name="mupe",
|
67 |
+
meta_file_train="metadata_coqui_brba.csv",
|
68 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
69 |
+
language="brba"
|
70 |
+
)
|
71 |
+
|
72 |
+
brportugal_train_config = BaseDatasetConfig(
|
73 |
+
formatter="coqui",
|
74 |
+
dataset_name="mupe",
|
75 |
+
meta_file_train="metadata_coqui_brportugal.csv",
|
76 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
77 |
+
language="brportugal"
|
78 |
+
)
|
79 |
+
|
80 |
+
brsp_train_config = BaseDatasetConfig(
|
81 |
+
formatter="coqui",
|
82 |
+
dataset_name="mupe",
|
83 |
+
meta_file_train="metadata_coqui_brsp.csv",
|
84 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
85 |
+
language="brsp"
|
86 |
+
)
|
87 |
+
|
88 |
+
brpe_train_config = BaseDatasetConfig(
|
89 |
+
formatter="coqui",
|
90 |
+
dataset_name="mupe",
|
91 |
+
meta_file_train="metadata_coqui_brpe.csv",
|
92 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
93 |
+
language="brpe"
|
94 |
+
)
|
95 |
+
|
96 |
+
brmg_train_config = BaseDatasetConfig(
|
97 |
+
formatter="coqui",
|
98 |
+
dataset_name="mupe",
|
99 |
+
meta_file_train="metadata_coqui_brmg.csv",
|
100 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
101 |
+
language="brmg"
|
102 |
+
)
|
103 |
+
|
104 |
+
brrj_train_config = BaseDatasetConfig(
|
105 |
+
formatter="coqui",
|
106 |
+
dataset_name="mupe",
|
107 |
+
meta_file_train="metadata_coqui_brrj.csv",
|
108 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
109 |
+
language="brrj"
|
110 |
+
)
|
111 |
+
|
112 |
+
brce_train_config = BaseDatasetConfig(
|
113 |
+
formatter="coqui",
|
114 |
+
dataset_name="mupe",
|
115 |
+
meta_file_train="metadata_coqui_brce.csv",
|
116 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
117 |
+
language="brce"
|
118 |
+
)
|
119 |
+
|
120 |
+
brrs_train_config = BaseDatasetConfig(
|
121 |
+
formatter="coqui",
|
122 |
+
dataset_name="mupe",
|
123 |
+
meta_file_train="metadata_coqui_brrs.csv",
|
124 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
125 |
+
language="brrs"
|
126 |
+
)
|
127 |
+
|
128 |
+
bralemanha_train_config = BaseDatasetConfig(
|
129 |
+
formatter="coqui",
|
130 |
+
dataset_name="mupe",
|
131 |
+
meta_file_train="metadata_coqui_bralemanha.csv",
|
132 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
133 |
+
language="bralemanha"
|
134 |
+
)
|
135 |
+
|
136 |
+
brgo_train_config = BaseDatasetConfig(
|
137 |
+
formatter="coqui",
|
138 |
+
dataset_name="mupe",
|
139 |
+
meta_file_train="metadata_coqui_brgo.csv",
|
140 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
141 |
+
language="brgo"
|
142 |
+
)
|
143 |
+
|
144 |
+
bral_train_config = BaseDatasetConfig(
|
145 |
+
formatter="coqui",
|
146 |
+
dataset_name="mupe",
|
147 |
+
meta_file_train="metadata_coqui_bral.csv",
|
148 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
149 |
+
language="bral"
|
150 |
+
)
|
151 |
+
|
152 |
+
brpr_train_config = BaseDatasetConfig(
|
153 |
+
formatter="coqui",
|
154 |
+
dataset_name="mupe",
|
155 |
+
meta_file_train="metadata_coqui_brpr.csv",
|
156 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
157 |
+
language="brpr"
|
158 |
+
)
|
159 |
+
|
160 |
+
bres_train_config = BaseDatasetConfig(
|
161 |
+
formatter="coqui",
|
162 |
+
dataset_name="mupe",
|
163 |
+
meta_file_train="metadata_coqui_bres.csv",
|
164 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
165 |
+
language="bres"
|
166 |
+
)
|
167 |
+
|
168 |
+
brpi_train_config = BaseDatasetConfig(
|
169 |
+
formatter="coqui",
|
170 |
+
dataset_name="mupe",
|
171 |
+
meta_file_train="metadata_coqui_brpi.csv",
|
172 |
+
path="/raid/datasets/MUPE/dataset/mupe/",
|
173 |
+
language="brpi"
|
174 |
+
)
|
175 |
+
|
176 |
+
# bres_train_config, brpi_train_config no files found
|
177 |
+
DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
|
178 |
+
|
179 |
+
|
180 |
+
### Extract speaker embeddings
|
181 |
+
SPEAKER_ENCODER_CHECKPOINT_PATH = (
|
182 |
+
"https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
|
183 |
+
)
|
184 |
+
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
|
185 |
+
|
186 |
+
D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
|
187 |
+
|
188 |
+
# Iterates all the dataset configs checking if the speakers embeddings are already computated, if not compute it
|
189 |
+
for dataset_conf in DATASETS_CONFIG_LIST:
|
190 |
+
# Check if the embeddings weren't already computed, if not compute it
|
191 |
+
embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
|
192 |
+
if not os.path.isfile(embeddings_file):
|
193 |
+
print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
|
194 |
+
compute_embeddings(
|
195 |
+
SPEAKER_ENCODER_CHECKPOINT_PATH,
|
196 |
+
SPEAKER_ENCODER_CONFIG_PATH,
|
197 |
+
embeddings_file,
|
198 |
+
old_speakers_file=None,
|
199 |
+
config_dataset_path=None,
|
200 |
+
formatter_name=dataset_conf.formatter,
|
201 |
+
dataset_name=dataset_conf.dataset_name,
|
202 |
+
dataset_path=dataset_conf.path,
|
203 |
+
meta_file_train=dataset_conf.meta_file_train,
|
204 |
+
meta_file_val=dataset_conf.meta_file_val,
|
205 |
+
disable_cuda=False,
|
206 |
+
no_eval=False,
|
207 |
+
)
|
208 |
+
D_VECTOR_FILES.append(embeddings_file)
|
209 |
+
|
210 |
+
|
211 |
+
# Audio config used in training.
|
212 |
+
audio_config = VitsAudioConfig(
|
213 |
+
sample_rate=SAMPLE_RATE,
|
214 |
+
hop_length=256,
|
215 |
+
win_length=1024,
|
216 |
+
fft_size=1024,
|
217 |
+
mel_fmin=0.0,
|
218 |
+
mel_fmax=None,
|
219 |
+
num_mels=80,
|
220 |
+
)
|
221 |
+
|
222 |
+
# Init VITSArgs setting the arguments that are needed for the YourTTS model
|
223 |
+
model_args = VitsArgs(
|
224 |
+
spec_segment_size=62,
|
225 |
+
hidden_channels=192,
|
226 |
+
hidden_channels_ffn_text_encoder=768,
|
227 |
+
num_heads_text_encoder=2,
|
228 |
+
num_layers_text_encoder=10,
|
229 |
+
kernel_size_text_encoder=3,
|
230 |
+
dropout_p_text_encoder=0.1,
|
231 |
+
d_vector_file=D_VECTOR_FILES,
|
232 |
+
use_d_vector_file=True,
|
233 |
+
d_vector_dim=512,
|
234 |
+
speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
|
235 |
+
speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
|
236 |
+
resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
|
237 |
+
# Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
|
238 |
+
use_speaker_encoder_as_loss=False,
|
239 |
+
# Useful parameters to enable multilingual training
|
240 |
+
use_language_embedding=False,
|
241 |
+
embedded_language_dim=4,
|
242 |
+
use_adaptive_weight_text_encoder=True,
|
243 |
+
use_perfect_class_batch_sampler=True,
|
244 |
+
perfect_class_batch_sampler_key="language"
|
245 |
+
)
|
246 |
+
|
247 |
+
# General training config, here you can change the batch size and others useful parameters
|
248 |
+
config = VitsConfig(
|
249 |
+
output_path=OUT_PATH,
|
250 |
+
model_args=model_args,
|
251 |
+
run_name=RUN_NAME,
|
252 |
+
project_name="SYNTACC",
|
253 |
+
run_description="""
|
254 |
+
- YourTTS with SYNTACC text encoder
|
255 |
+
""",
|
256 |
+
dashboard_logger=DASHBOARD_LOGGER,
|
257 |
+
logger_uri=LOGGER_URI,
|
258 |
+
audio=audio_config,
|
259 |
+
batch_size=BATCH_SIZE,
|
260 |
+
batch_group_size=48,
|
261 |
+
eval_batch_size=BATCH_SIZE,
|
262 |
+
num_loader_workers=8,
|
263 |
+
eval_split_max_size=256,
|
264 |
+
print_step=50,
|
265 |
+
plot_step=100,
|
266 |
+
log_model_step=1000,
|
267 |
+
save_step=5000,
|
268 |
+
save_n_checkpoints=2,
|
269 |
+
save_checkpoints=True,
|
270 |
+
# target_loss="loss_1",
|
271 |
+
print_eval=False,
|
272 |
+
use_phonemes=False,
|
273 |
+
phonemizer="espeak",
|
274 |
+
phoneme_language="en",
|
275 |
+
compute_input_seq_cache=True,
|
276 |
+
add_blank=True,
|
277 |
+
text_cleaner="multilingual_cleaners",
|
278 |
+
characters=CharactersConfig(
|
279 |
+
characters_class="TTS.tts.models.vits.VitsCharacters",
|
280 |
+
pad="_",
|
281 |
+
eos="&",
|
282 |
+
bos="*",
|
283 |
+
blank=None,
|
284 |
+
characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
|
285 |
+
punctuations="\u2014!'(),-.:;?\u00bf ",
|
286 |
+
phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
|
287 |
+
is_unique=True,
|
288 |
+
is_sorted=True,
|
289 |
+
),
|
290 |
+
    phoneme_cache_path=None,
    precompute_num_workers=12,
    start_by_longest=True,
    datasets=DATASETS_CONFIG_LIST,
    cudnn_benchmark=False,
    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
    mixed_precision=False,
    test_sentences=[
        # GUSTAVO: only speakers seen during training
        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
        # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
        # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
    ],
    # Enable the weighted sampler
    use_weighted_sampler=True,
    # Ensures that all languages/speakers are seen in the training batches equally, no matter how many samples each one has
    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
    weighted_sampler_attrs={"language": 1.0},
    weighted_sampler_multipliers={
        # "speaker_name": {
        # You can force the batching scheme to give a higher weight to a certain speaker, so that speaker appears more frequently in the batches.
        # This speeds up speaker adaptation. Considering the CML train set and "new_speaker" as the name of the speaker you want to adapt:
        # the line below makes the balancer treat "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers in the CML dataset.
        # 'new_speaker': 106,  # (CML total train speakers)/4 = (424/4) = 106
        # }
    },
    # Set the Speaker Consistency Loss (SCL) weight (α) to 9.0, as in the YourTTS paper
    speaker_encoder_loss_alpha=9.0,
)

# Load all the dataset samples and split training and evaluation sets
train_samples, eval_samples = load_tts_samples(
    config.datasets,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# Init the model
model = Vits.init_from_config(config)

# Init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
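The `use_weighted_sampler=True` setting above, combined with `weighted_sampler_attrs={"language": 1.0}`, rebalances training so that low-resource accents are sampled about as often as high-resource ones. The snippet below is only a minimal sketch of that idea on top of PyTorch's WeightedRandomSampler; it is not Coqui's internal implementation, and the assumption that each sample dict carries a "language" key mirrors the metadata produced by load_tts_samples.

from collections import Counter

import torch
from torch.utils.data import WeightedRandomSampler


def language_balanced_sampler(samples, alpha=1.0):
    # Weight every sample by 1 / count(language)**alpha so each accent is drawn
    # equally often in expectation, regardless of how many utterances it has.
    counts = Counter(s["language"] for s in samples)
    weights = torch.tensor(
        [1.0 / (counts[s["language"]] ** alpha) for s in samples], dtype=torch.double
    )
    return WeightedRandomSampler(weights, num_samples=len(samples), replacement=True)


# Hypothetical usage with the train_samples returned by load_tts_samples():
# sampler = language_balanced_sampler(train_samples, alpha=1.0)
# loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, sampler=sampler)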
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/trainer_0_log.txt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92bf84450b9ef1865a5f553a00c5d3649069dd6b17b314e548a429a52a8a9f3f
size 1423682
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/best_model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cdcbc348b4c18b558e8d8b5409027adf5897da1fce86b72795aaaf3635d3cb90
size 1044057262
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/best_model_87818.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cdcbc348b4c18b558e8d8b5409027adf5897da1fce86b72795aaaf3635d3cb90
size 1044057262
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/config.json
ADDED
@@ -0,0 +1,496 @@
{
    "output_path": "/raid/datasets/MUPE/Experiments/runs",
    "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
    "run_name": "YourTTS-Syntacc-PT",
    "project_name": "SYNTACC",
    "run_description": "\n - YourTTS with SYNTACC text encoder\n ",
    "print_step": 50,
    "plot_step": 100,
    "model_param_stats": false,
    "wandb_entity": null,
    "dashboard_logger": "clearml",
    "save_on_interrupt": true,
    "log_model_step": 1000,
    "save_step": 5000,
    "save_n_checkpoints": 2,
    "save_checkpoints": true,
    "save_all_best": false,
    "save_best_after": 10000,
    "target_loss": null,
    "print_eval": false,
    "test_delay_epochs": 0,
    "run_eval": true,
    "run_eval_steps": null,
    "distributed_backend": "nccl",
    "distributed_url": "tcp://localhost:54321",
    "mixed_precision": false,
    "precision": "fp16",
    "epochs": 1000,
    "batch_size": 26,
    "eval_batch_size": 26,
    "grad_clip": [1000, 1000],
    "scheduler_after_epoch": true,
    "lr": 0.001,
    "optimizer": "AdamW",
    "optimizer_params": {"betas": [0.8, 0.99], "eps": 1e-09, "weight_decay": 0.01},
    "lr_scheduler": null,
    "lr_scheduler_params": {},
    "use_grad_scaler": false,
    "allow_tf32": false,
    "cudnn_enable": true,
    "cudnn_deterministic": false,
    "cudnn_benchmark": false,
    "training_seed": 54321,
    "model": "vits",
    "num_loader_workers": 8,
    "num_eval_loader_workers": 0,
    "use_noise_augment": false,
    "audio": {"fft_size": 1024, "sample_rate": 16000, "win_length": 1024, "hop_length": 256, "num_mels": 80, "mel_fmin": 0.0, "mel_fmax": null},
    "use_phonemes": false,
    "phonemizer": "espeak",
    "phoneme_language": "en",
    "compute_input_seq_cache": true,
    "text_cleaner": "multilingual_cleaners",
    "enable_eos_bos_chars": false,
    "test_sentences_file": "",
    "phoneme_cache_path": null,
    "characters": {
        "characters_class": "TTS.tts.models.vits.VitsCharacters",
        "vocab_dict": null,
        "pad": "_",
        "eos": "&",
        "bos": "*",
        "blank": null,
        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
        "punctuations": "\u2014!'(),-.:;?\u00bf ",
        "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
        "is_unique": true,
        "is_sorted": true
    },
    "add_blank": true,
    "batch_group_size": 48,
    "loss_masking": null,
    "min_audio_len": 1,
    "max_audio_len": Infinity,
    "min_text_len": 1,
    "max_text_len": Infinity,
    "compute_f0": false,
    "compute_energy": false,
    "compute_linear_spec": true,
    "precompute_num_workers": 12,
    "start_by_longest": true,
    "shuffle": false,
    "drop_last": false,
    "datasets": [
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brpb.csv", "ignored_speakers": null, "language": "brpb", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brba.csv", "ignored_speakers": null, "language": "brba", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brportugal.csv", "ignored_speakers": null, "language": "brportugal", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brsp.csv", "ignored_speakers": null, "language": "brsp", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brpe.csv", "ignored_speakers": null, "language": "brpe", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brmg.csv", "ignored_speakers": null, "language": "brmg", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brrj.csv", "ignored_speakers": null, "language": "brrj", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brce.csv", "ignored_speakers": null, "language": "brce", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brrs.csv", "ignored_speakers": null, "language": "brrs", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_bralemanha.csv", "ignored_speakers": null, "language": "bralemanha", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brgo.csv", "ignored_speakers": null, "language": "brgo", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_bral.csv", "ignored_speakers": null, "language": "bral", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""},
        {"formatter": "coqui", "dataset_name": "mupe", "path": "/raid/datasets/MUPE/dataset/mupe/", "meta_file_train": "metadata_coqui_brpr.csv", "ignored_speakers": null, "language": "brpr", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": ""}
    ],
    "test_sentences": [
        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", null, "brsp"],
        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", null, "brpb"],
        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", null, "brba"],
        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", null, "brportugal"],
        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", null, "brpe"],
        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", null, "brmg"],
        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", null, "brrj"],
        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", null, "brce"],
        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", null, "brrs"],
        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", null, "bralemanha"],
        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", null, "brgo"],
        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", null, "bral"],
        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", null, "brpr"]
    ],
    "eval_split_max_size": 256,
    "eval_split_size": 0.01,
    "use_speaker_weighted_sampler": false,
    "speaker_weighted_sampler_alpha": 1.0,
    "use_language_weighted_sampler": false,
    "language_weighted_sampler_alpha": 1.0,
    "use_length_weighted_sampler": false,
    "length_weighted_sampler_alpha": 1.0,
    "model_args": {
        "num_chars": 266,
        "out_channels": 513,
        "spec_segment_size": 62,
        "hidden_channels": 192,
        "use_adaptive_weight_text_encoder": true,
        "use_perfect_class_batch_sampler": true,
        "perfect_class_batch_sampler_key": "language",
        "hidden_channels_ffn_text_encoder": 768,
        "num_heads_text_encoder": 2,
        "num_layers_text_encoder": 10,
        "kernel_size_text_encoder": 3,
        "dropout_p_text_encoder": 0.1,
        "dropout_p_duration_predictor": 0.5,
        "kernel_size_posterior_encoder": 5,
        "dilation_rate_posterior_encoder": 1,
        "num_layers_posterior_encoder": 16,
        "kernel_size_flow": 5,
        "dilation_rate_flow": 1,
        "num_layers_flow": 4,
        "resblock_type_decoder": "2",
        "resblock_kernel_sizes_decoder": [3, 7, 11],
        "resblock_dilation_sizes_decoder": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
        "upsample_rates_decoder": [8, 8, 2, 2],
        "upsample_initial_channel_decoder": 512,
        "upsample_kernel_sizes_decoder": [16, 16, 4, 4],
        "periods_multi_period_discriminator": [2, 3, 5, 7, 11],
        "use_sdp": true,
        "noise_scale": 1.0,
        "inference_noise_scale": 0.667,
        "length_scale": 1,
        "noise_scale_dp": 1.0,
        "inference_noise_scale_dp": 1.0,
        "max_inference_len": null,
        "init_discriminator": true,
        "use_spectral_norm_disriminator": false,
        "use_speaker_embedding": false,
        "num_speakers": 0,
        "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/speakers.pth",
        "d_vector_file": [
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
        ],
        "speaker_embedding_channels": 256,
        "use_d_vector_file": true,
        "d_vector_dim": 512,
        "detach_dp_input": true,
        "use_language_embedding": false,
        "embedded_language_dim": 4,
        "num_languages": 0,
        "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/language_ids.json",
        "use_speaker_encoder_as_loss": false,
        "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
        "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
        "condition_dp_on_speaker": true,
        "freeze_encoder": false,
        "freeze_DP": false,
        "freeze_PE": false,
        "freeze_flow_decoder": false,
        "freeze_waveform_decoder": false,
        "encoder_sample_rate": null,
        "interpolate_z": true,
        "reinit_DP": false,
        "reinit_text_encoder": false
    },
    "lr_gen": 0.0002,
    "lr_disc": 0.0002,
    "lr_scheduler_gen": "ExponentialLR",
    "lr_scheduler_gen_params": {"gamma": 0.999875, "last_epoch": -1},
    "lr_scheduler_disc": "ExponentialLR",
    "lr_scheduler_disc_params": {"gamma": 0.999875, "last_epoch": -1},
    "kl_loss_alpha": 1.0,
    "disc_loss_alpha": 1.0,
    "gen_loss_alpha": 1.0,
    "feat_loss_alpha": 1.0,
    "mel_loss_alpha": 45.0,
    "dur_loss_alpha": 1.0,
    "speaker_encoder_loss_alpha": 9.0,
    "return_wav": true,
    "use_weighted_sampler": true,
    "weighted_sampler_attrs": {"language": 1.0},
    "weighted_sampler_multipliers": {},
    "r": 1,
    "num_speakers": 0,
    "use_speaker_embedding": false,
    "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/speakers.pth",
    "speaker_embedding_channels": 256,
    "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/language_ids.json",
    "use_language_embedding": false,
    "use_d_vector_file": true,
    "d_vector_file": [
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
    ],
    "d_vector_dim": 512
}
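Two flags in the model_args above are the SYNTACC-specific additions: "use_adaptive_weight_text_encoder" and "use_perfect_class_batch_sampler" with "perfect_class_batch_sampler_key": "language". The sketch below only illustrates what a "perfect class" batch sampler does, i.e. every batch holds the same number of samples from each accent; it is a conceptual illustration, not the PerfectBatchSampler class shipped with Coqui TTS.

import random
from collections import defaultdict


def perfect_class_batches(languages, batch_size, num_classes_in_batch):
    # Build batches containing batch_size // num_classes_in_batch samples from each of
    # num_classes_in_batch accents, so that no accent dominates a batch.
    assert batch_size % num_classes_in_batch == 0
    per_class = batch_size // num_classes_in_batch
    buckets = defaultdict(list)
    for idx, lang in enumerate(languages):
        buckets[lang].append(idx)
    for indices in buckets.values():
        random.shuffle(indices)
    batches = []
    while True:
        available = [lang for lang, idxs in buckets.items() if len(idxs) >= per_class]
        if len(available) < num_classes_in_batch:
            break
        chosen = random.sample(available, num_classes_in_batch)
        batch = [buckets[lang].pop() for lang in chosen for _ in range(per_class)]
        random.shuffle(batch)
        batches.append(batch)
    return batches


# With the batch_size of 26 used here and all 13 accents per batch, each batch would hold 2 samples per accent.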
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/language_ids.json
ADDED
@@ -0,0 +1,15 @@
{
    "bral": 0,
    "bralemanha": 1,
    "brba": 2,
    "brce": 3,
    "brgo": 4,
    "brmg": 5,
    "brpb": 6,
    "brpe": 7,
    "brportugal": 8,
    "brpr": 9,
    "brrj": 10,
    "brrs": 11,
    "brsp": 12
}
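language_ids.json maps each accent code to an integer id. With "use_adaptive_weight_text_encoder": true, the SYNTACC text encoder uses that id to pick accent-specific weights inside otherwise shared layers. The module below is only an illustrative simplification of the mechanism (one independent weight bank per accent, selected by the language id); the actual SYNTACC layer is more involved.

import json

import torch
import torch.nn as nn
import torch.nn.functional as F


class AccentAdaptiveLinear(nn.Module):
    # Keep one (out_features x in_features) weight bank per accent and select it by language id.
    def __init__(self, in_features, out_features, num_accents):
        super().__init__()
        self.weight = nn.Parameter(0.02 * torch.randn(num_accents, out_features, in_features))
        self.bias = nn.Parameter(torch.zeros(num_accents, out_features))

    def forward(self, x, lang_id):
        return F.linear(x, self.weight[lang_id], self.bias[lang_id])


with open("language_ids.json", encoding="utf-8") as f:
    LANG2ID = json.load(f)

layer = AccentAdaptiveLinear(192, 192, num_accents=len(LANG2ID))
hidden = torch.randn(2, 50, 192)          # (batch, frames, hidden_channels)
out = layer(hidden, LANG2ID["brsp"])      # run the forward pass with the "brsp" weight bank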
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/speakers.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
size 3296
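The .pth and log entries in this commit are stored as Git LFS pointers: three text lines giving the spec version, the sha256 of the real binary, and its size in bytes (3296 bytes here, so speakers.pth is only a small speaker index, while the best_model.pth pointers reference roughly 1 GB checkpoints). A tiny parser for that pointer format, useful for checking what a pointer references without fetching it:

from pathlib import Path


def read_lfs_pointer(path):
    # Parse the "version ..." / "oid sha256:<hash>" / "size <bytes>" lines of a Git LFS pointer file.
    fields = dict(line.split(" ", 1) for line in Path(path).read_text().splitlines() if line.strip())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].split(":", 1)[1],
        "size_bytes": int(fields["size"]),
    }


# read_lfs_pointer("speakers.pth") -> {"version": "https://git-lfs.github.com/spec/v1", "sha256": "d0b8d801...", "size_bytes": 3296}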
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/train_syntacc.py
ADDED
@@ -0,0 +1,352 @@
import os

import torch
from trainer import Trainer, TrainerArgs

from TTS.bin.compute_embeddings import compute_embeddings
from TTS.bin.resample import resample_files
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
from TTS.utils.downloaders import download_libri_tts
from torch.utils.data import DataLoader
from TTS.utils.samplers import PerfectBatchSampler

torch.set_num_threads(24)

# pylint: disable=W0105
"""
This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
The YourTTS model is based on the VITS model, but it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
"""
CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))

# Name of the run for the Trainer
RUN_NAME = "YourTTS-Syntacc-PT"

# Path where you want to save the model outputs (configs, checkpoints and tensorboard logs)
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs")  # "/raid/coqui/Checkpoints/original-YourTTS/"

# To do transfer learning and speed up training, set here the path to the CML-TTS checkpoint, which can be downloaded from: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth"

# This parameter is useful for debugging: it skips the training epochs, runs only the evaluation and produces the test sentences
SKIP_TRAIN_EPOCH = False

# Batch size used in training and evaluation
BATCH_SIZE = 26

# Training sampling rate and the target sampling rate for resampling the downloaded dataset (Note: if you change this you might need to redownload the dataset!)
# Note: if you add new datasets, make sure the dataset sampling rate matches this parameter; otherwise, resample your audios
SAMPLE_RATE = 16000

# Logger settings (the ClearML values below override the TensorBoard defaults above them)
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None

DASHBOARD_LOGGER = "clearml"
LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"

# Max audio length in seconds used in training (every audio longer than this will be ignored)
MAX_AUDIO_LEN_IN_SECONDS = float("inf")

# Define the dataset configs here, one per accent of the MUPE dataset
brpb_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brpb.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brpb",
)

brba_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brba.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brba",
)

brportugal_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brportugal.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brportugal",
)

brsp_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brsp.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brsp",
)

brpe_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brpe.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brpe",
)

brmg_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brmg.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brmg",
)

brrj_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brrj.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brrj",
)

brce_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brce.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brce",
)

brrs_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brrs.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brrs",
)

bralemanha_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_bralemanha.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="bralemanha",
)

brgo_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brgo.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brgo",
)

bral_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_bral.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="bral",
)

brpr_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brpr.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brpr",
)

bres_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_bres.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="bres",
)

brpi_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brpi.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brpi",
)

# bres_train_config and brpi_train_config are left out of the list below: no files were found for them
DATASETS_CONFIG_LIST = [brpb_train_config, brba_train_config, brportugal_train_config, brsp_train_config, brpe_train_config, brmg_train_config, brrj_train_config, brce_train_config, brrs_train_config, bralemanha_train_config, brgo_train_config, bral_train_config, brpr_train_config]


### Extract speaker embeddings
SPEAKER_ENCODER_CHECKPOINT_PATH = (
    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
)
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"

D_VECTOR_FILES = []  # List of speaker embeddings/d-vectors to be used during training

# Iterate over all the dataset configs and check whether the speaker embeddings were already computed; if not, compute them
for dataset_conf in DATASETS_CONFIG_LIST:
    embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
    if not os.path.isfile(embeddings_file):
        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
        compute_embeddings(
            SPEAKER_ENCODER_CHECKPOINT_PATH,
            SPEAKER_ENCODER_CONFIG_PATH,
            embeddings_file,
            old_speakers_file=None,
            config_dataset_path=None,
            formatter_name=dataset_conf.formatter,
            dataset_name=dataset_conf.dataset_name,
            dataset_path=dataset_conf.path,
            meta_file_train=dataset_conf.meta_file_train,
            meta_file_val=dataset_conf.meta_file_val,
            disable_cuda=False,
            no_eval=False,
        )
    D_VECTOR_FILES.append(embeddings_file)


# Audio config used in training
audio_config = VitsAudioConfig(
    sample_rate=SAMPLE_RATE,
    hop_length=256,
    win_length=1024,
    fft_size=1024,
    mel_fmin=0.0,
    mel_fmax=None,
    num_mels=80,
)

# Init VitsArgs, setting the arguments that are needed for the YourTTS model
model_args = VitsArgs(
    spec_segment_size=62,
    hidden_channels=192,
    hidden_channels_ffn_text_encoder=768,
    num_heads_text_encoder=2,
    num_layers_text_encoder=10,
    kernel_size_text_encoder=3,
    dropout_p_text_encoder=0.1,
    d_vector_file=D_VECTOR_FILES,
    use_d_vector_file=True,
    d_vector_dim=512,
    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
    resblock_type_decoder="2",  # In the paper we accidentally trained YourTTS with ResNet blocks of type 2; if you prefer, you can use type 1 blocks like the VITS model
    # Parameters to enable the Speaker Consistency Loss (SCL) described in the paper
    use_speaker_encoder_as_loss=False,
    # Parameters to enable multilingual training
    use_language_embedding=False,
    embedded_language_dim=4,
    use_adaptive_weight_text_encoder=True,
    use_perfect_class_batch_sampler=True,
    perfect_class_batch_sampler_key="language",
)

# General training config; here you can change the batch size and other useful parameters
config = VitsConfig(
    output_path=OUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name="SYNTACC",
    run_description="""
        - YourTTS with SYNTACC text encoder
    """,
    dashboard_logger=DASHBOARD_LOGGER,
    logger_uri=LOGGER_URI,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=48,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=8,
    eval_split_max_size=256,
    print_step=50,
    plot_step=100,
    log_model_step=1000,
    save_step=5000,
    save_n_checkpoints=2,
    save_checkpoints=True,
    # target_loss="loss_1",
    print_eval=False,
    use_phonemes=False,
    phonemizer="espeak",
    phoneme_language="en",
    compute_input_seq_cache=True,
    add_blank=True,
    text_cleaner="multilingual_cleaners",
    characters=CharactersConfig(
        characters_class="TTS.tts.models.vits.VitsCharacters",
        pad="_",
        eos="&",
        bos="*",
        blank=None,
        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
        punctuations="\u2014!'(),-.:;?\u00bf ",
        phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
        is_unique=True,
        is_sorted=True,
    ),
    phoneme_cache_path=None,
    precompute_num_workers=12,
    start_by_longest=True,
    datasets=DATASETS_CONFIG_LIST,
    cudnn_benchmark=False,
    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
    mixed_precision=False,
    test_sentences=[
        # GUSTAVO: only speakers seen during training
        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
        # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
        # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
    ],
    # Enable the weighted sampler
    use_weighted_sampler=True,
    # Ensures that all languages/speakers are seen in the training batches equally, no matter how many samples each one has
    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
    weighted_sampler_attrs={"language": 1.0},
    weighted_sampler_multipliers={
        # "speaker_name": {
        # You can force the batching scheme to give a higher weight to a certain speaker, so that speaker appears more frequently in the batches.
        # This speeds up speaker adaptation. Considering the CML train set and "new_speaker" as the name of the speaker you want to adapt:
        # the line below makes the balancer treat "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers in the CML dataset.
        # 'new_speaker': 106,  # (CML total train speakers)/4 = (424/4) = 106
        # }
    },
    # Set the Speaker Consistency Loss (SCL) weight (α) to 9.0, as in the YourTTS paper
    speaker_encoder_loss_alpha=9.0,
)

# Load all the dataset samples and split training and evaluation sets
train_samples, eval_samples = load_tts_samples(
    config.datasets,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# Init the model
model = Vits.init_from_config(config)

# Init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
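The recipe above sets speaker_encoder_loss_alpha=9.0 (the SCL weight from the YourTTS paper) even though use_speaker_encoder_as_loss is False in this run, so the loss is configured but inactive. When enabled, the Speaker Consistency Loss pulls the speaker-encoder embedding of the generated audio towards the embedding of the ground-truth audio. A minimal sketch of that term, assuming the embeddings have already been extracted with the H/ASP speaker encoder referenced above:

import torch
import torch.nn.functional as F


def speaker_consistency_loss(gen_emb, ref_emb, alpha=9.0):
    # YourTTS-style SCL: scaled negative cosine similarity between the speaker embeddings
    # of generated and ground-truth audio, both of shape (batch, d_vector_dim) = (batch, 512) here.
    return -alpha * F.cosine_similarity(gen_emb, ref_emb, dim=-1).mean()


# Toy example with random 512-dimensional d-vectors:
loss = speaker_consistency_loss(torch.randn(4, 512), torch.randn(4, 512))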
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/trainer_0_log.txt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dc570d138b2c5d578e697b92e2d2d060c0945fcd0f880761a3fc800eaf619b6a
size 97918
Experiments/train_syntacc.py
ADDED
@@ -0,0 +1,352 @@
import os

import torch
from trainer import Trainer, TrainerArgs

from TTS.bin.compute_embeddings import compute_embeddings
from TTS.bin.resample import resample_files
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
from TTS.utils.downloaders import download_libri_tts
from torch.utils.data import DataLoader
from TTS.utils.samplers import PerfectBatchSampler

torch.set_num_threads(24)

# pylint: disable=W0105
"""
This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
The YourTTS model is based on the VITS model, but it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
"""
CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))

# Name of the run for the Trainer
RUN_NAME = "YourTTS-Syntacc-PT"

# Path where you want to save the model outputs (configs, checkpoints and tensorboard logs)
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs")  # "/raid/coqui/Checkpoints/original-YourTTS/"

# To do transfer learning and speed up training, set here the path to the CML-TTS checkpoint, which can be downloaded from: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth"

# This parameter is useful for debugging: it skips the training epochs, runs only the evaluation and produces the test sentences
SKIP_TRAIN_EPOCH = False

# Batch size used in training and evaluation
BATCH_SIZE = 26

# Training sampling rate and the target sampling rate for resampling the downloaded dataset (Note: if you change this you might need to redownload the dataset!)
# Note: if you add new datasets, make sure the dataset sampling rate matches this parameter; otherwise, resample your audios
SAMPLE_RATE = 16000

# Logger settings (the ClearML values below override the TensorBoard defaults above them)
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None

DASHBOARD_LOGGER = "clearml"
LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"

# Max audio length in seconds used in training (every audio longer than this will be ignored)
MAX_AUDIO_LEN_IN_SECONDS = float("inf")

# Define the dataset configs here, one per accent of the MUPE dataset
brpb_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brpb.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brpb",
)

brba_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brba.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brba",
)

brportugal_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brportugal.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brportugal",
)

brsp_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brsp.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brsp",
)

brpe_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brpe.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brpe",
)

brmg_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brmg.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brmg",
)

brrj_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brrj.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brrj",
)

brce_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brce.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brce",
)

brrs_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brrs.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brrs",
)

bralemanha_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_bralemanha.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="bralemanha",
)

brgo_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brgo.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brgo",
)

bral_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_bral.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="bral",
)

brpr_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brpr.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brpr",
)

bres_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_bres.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="bres",
)

brpi_train_config = BaseDatasetConfig(
    formatter="coqui",
    dataset_name="mupe",
    meta_file_train="metadata_coqui_brpi.csv",
    path="/raid/datasets/MUPE/dataset/mupe/",
    language="brpi",
)

# bres_train_config and brpi_train_config are left out of the list below: no files were found for them
DATASETS_CONFIG_LIST = [brpb_train_config, brba_train_config, brportugal_train_config, brsp_train_config, brpe_train_config, brmg_train_config, brrj_train_config, brce_train_config, brrs_train_config, bralemanha_train_config, brgo_train_config, bral_train_config, brpr_train_config]


### Extract speaker embeddings
SPEAKER_ENCODER_CHECKPOINT_PATH = (
    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
)
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"

D_VECTOR_FILES = []  # List of speaker embeddings/d-vectors to be used during training

# Iterate over all the dataset configs and check whether the speaker embeddings were already computed; if not, compute them
for dataset_conf in DATASETS_CONFIG_LIST:
    embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
    if not os.path.isfile(embeddings_file):
        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
        compute_embeddings(
            SPEAKER_ENCODER_CHECKPOINT_PATH,
            SPEAKER_ENCODER_CONFIG_PATH,
            embeddings_file,
            old_speakers_file=None,
            config_dataset_path=None,
            formatter_name=dataset_conf.formatter,
            dataset_name=dataset_conf.dataset_name,
            dataset_path=dataset_conf.path,
            meta_file_train=dataset_conf.meta_file_train,
            meta_file_val=dataset_conf.meta_file_val,
            disable_cuda=False,
            no_eval=False,
        )
    D_VECTOR_FILES.append(embeddings_file)


# Audio config used in training
audio_config = VitsAudioConfig(
    sample_rate=SAMPLE_RATE,
    hop_length=256,
    win_length=1024,
    fft_size=1024,
    mel_fmin=0.0,
    mel_fmax=None,
    num_mels=80,
)

# Init VitsArgs, setting the arguments that are needed for the YourTTS model
model_args = VitsArgs(
    spec_segment_size=62,
    hidden_channels=192,
    hidden_channels_ffn_text_encoder=768,
    num_heads_text_encoder=2,
    num_layers_text_encoder=10,
    kernel_size_text_encoder=3,
    dropout_p_text_encoder=0.1,
    d_vector_file=D_VECTOR_FILES,
    use_d_vector_file=True,
    d_vector_dim=512,
    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
    resblock_type_decoder="2",  # In the paper we accidentally trained YourTTS with ResNet blocks of type 2; if you prefer, you can use type 1 blocks like the VITS model
    # Parameters to enable the Speaker Consistency Loss (SCL) described in the paper
    use_speaker_encoder_as_loss=False,
    # Parameters to enable multilingual training
    use_language_embedding=False,
    embedded_language_dim=4,
    use_adaptive_weight_text_encoder=True,
    use_perfect_class_batch_sampler=True,
    perfect_class_batch_sampler_key="language",
)

# General training config; here you can change the batch size and other useful parameters
config = VitsConfig(
    output_path=OUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name="SYNTACC",
    run_description="""
        - YourTTS with SYNTACC text encoder
    """,
    dashboard_logger=DASHBOARD_LOGGER,
    logger_uri=LOGGER_URI,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=48,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=8,
    eval_split_max_size=256,
    print_step=50,
    plot_step=100,
    log_model_step=1000,
    save_step=5000,
    save_n_checkpoints=2,
    save_checkpoints=True,
    # target_loss="loss_1",
    print_eval=False,
    use_phonemes=False,
    phonemizer="espeak",
    phoneme_language="en",
    compute_input_seq_cache=True,
    add_blank=True,
    text_cleaner="multilingual_cleaners",
    characters=CharactersConfig(
        characters_class="TTS.tts.models.vits.VitsCharacters",
        pad="_",
        eos="&",
        bos="*",
        blank=None,
        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
        punctuations="\u2014!'(),-.:;?\u00bf ",
        phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
        is_unique=True,
        is_sorted=True,
    ),
    phoneme_cache_path=None,
    precompute_num_workers=12,
    start_by_longest=True,
    datasets=DATASETS_CONFIG_LIST,
    cudnn_benchmark=False,
    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
    mixed_precision=False,
    test_sentences=[
        # GUSTAVO: only speakers seen during training
        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
        # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
        # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
    ],
    # Enable the weighted sampler
    use_weighted_sampler=True,
    # Ensures that all languages/speakers are seen in the training batches equally, no matter how many samples each one has
    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
    weighted_sampler_attrs={"language": 1.0},
    weighted_sampler_multipliers={
        # "speaker_name": {
        # You can force the batching scheme to give a higher weight to a certain speaker, so that speaker appears more frequently in the batches.
        # This speeds up speaker adaptation. Considering the CML train set and "new_speaker" as the name of the speaker you want to adapt:
        # the line below makes the balancer treat "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers in the CML dataset.
        # 'new_speaker': 106,  # (CML total train speakers)/4 = (424/4) = 106
        # }
    },
    # Set the Speaker Consistency Loss (SCL) weight (α) to 9.0, as in the YourTTS paper
    speaker_encoder_loss_alpha=9.0,
)

# Load all the dataset samples and split training and evaluation sets
train_samples, eval_samples = load_tts_samples(
    config.datasets,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# Init the model
model = Vits.init_from_config(config)

# Init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
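Once a run like the ones above has finished, the artifacts it leaves behind (best_model.pth, config.json, speakers.pth, language_ids.json) are enough to synthesize the test sentences outside the Trainer. The sketch below does that with Coqui's Synthesizer helper, reusing the (text, speaker, style_wav, accent) convention of test_sentences; the keyword names follow recent Coqui TTS releases and should be checked against the installed version.

from TTS.utils.synthesizer import Synthesizer

RUN_DIR = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1"

synth = Synthesizer(
    tts_checkpoint=f"{RUN_DIR}/best_model.pth",
    tts_config_path=f"{RUN_DIR}/config.json",
    tts_speakers_file=f"{RUN_DIR}/speakers.pth",
    tts_languages_file=f"{RUN_DIR}/language_ids.json",
    use_cuda=True,
)

# Same (text, speaker, style_wav, accent) convention used by test_sentences above
wav = synth.tts(
    "Quem espera sempre alcança.",
    speaker_name="ANTONIO_DE_AMORIM_COSTA",
    language_name="brpe",
)
synth.save_wav(wav, "sample_brpe.wav")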