Edresson committed on
Commit
1dfa9fd
·
1 Parent(s): 2bc0892
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +16 -34
  2. .gitignore +3 -0
  3. Experiments/TTS +1 -0
  4. Experiments/nohup.out +3 -0
  5. Experiments/run/events.out.tfevents.1706367627.edresson-train-80.45395.0 +3 -0
  6. Experiments/run/events.out.tfevents.1706367849.edresson-train-80.46052.0 +3 -0
  7. Experiments/run/events.out.tfevents.1706367954.edresson-train-80.46941.0 +3 -0
  8. Experiments/run/events.out.tfevents.1706446227.edresson-train-80.140666.0 +3 -0
  9. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model.pth +3 -0
  10. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model_124752.pth +3 -0
  11. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_130000.pth +3 -0
  12. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_135000.pth +3 -0
  13. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/config.json +496 -0
  14. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json +15 -0
  15. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth +3 -0
  16. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/train_syntacc_baseline.py +352 -0
  17. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/trainer_0_log.txt +3 -0
  18. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model.pth +3 -0
  19. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model_85001.pth +3 -0
  20. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json +496 -0
  21. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json +15 -0
  22. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth +3 -0
  23. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/train_syntacc_baseline.py +352 -0
  24. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/trainer_0_log.txt +3 -0
  25. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model.pth +3 -0
  26. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model_87192.pth +3 -0
  27. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_130000.pth +3 -0
  28. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_135000.pth +3 -0
  29. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json +496 -0
  30. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json +15 -0
  31. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth +3 -0
  32. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/train_syntacc_baseline.py +352 -0
  33. Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/trainer_0_log.txt +3 -0
  34. Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/best_model.pth +3 -0
  35. Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/best_model_78415.pth +3 -0
  36. Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_80000.pth +3 -0
  37. Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth +3 -0
  38. Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/config.json +496 -0
  39. Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/language_ids.json +15 -0
  40. Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/speakers.pth +3 -0
  41. Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/train_syntacc.py +352 -0
  42. Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/trainer_0_log.txt +3 -0
  43. Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/best_model.pth +3 -0
  44. Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/best_model_87818.pth +3 -0
  45. Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/config.json +496 -0
  46. Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/language_ids.json +15 -0
  47. Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/speakers.pth +3 -0
  48. Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/train_syntacc.py +352 -0
  49. Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/trainer_0_log.txt +3 -0
  50. Experiments/train_syntacc.py +352 -0
.gitattributes CHANGED
@@ -1,35 +1,17 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.txt filter=lfs diff=lfs merge=lfs -text
2
+ *.t7 filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  *.zip filter=lfs diff=lfs merge=lfs -text
4
+ *.pth filter=lfs diff=lfs merge=lfs -text
5
+ *.wav filter=lfs diff=lfs merge=lfs -text
6
+ *.pt filter=lfs diff=lfs merge=lfs -text
7
+ *.out filter=lfs diff=lfs merge=lfs -text
8
+ *.0 filter=lfs diff=lfs merge=lfs -text
9
+ *.csv filter=lfs diff=lfs merge=lfs -text
10
+ *.o filter=lfs diff=lfs merge=lfs -text
11
+ *.so filter=lfs diff=lfs merge=lfs -text
12
+ HierSpeech_TTS/denoiser/g_best filter=lfs diff=lfs merge=lfs -text
13
+ g_best filter=lfs diff=lfs merge=lfs -text
14
+ TTS-private/nohup.out filter=lfs diff=lfs merge=lfs -text
15
+ nohup.out filter=lfs diff=lfs merge=lfs -text
16
+ TTS-private/run/events.out.tfevents.1705084461.edresson-train-80-2.93786.0 filter=lfs diff=lfs merge=lfs -text
17
+ events.out.tfevents.1705084461.edresson-train-80-2.93786.0 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ */.git/*
2
+ .git
3
+ .git/*
Experiments/TTS ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit a45dfd62668cd5778dd6a384308097ba0370c034
Experiments/nohup.out ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b67f0b1dbf0a04937b9b6db1e55de4cc6057c5f1832ebb1a8c4e3c2f4b5a9e6
3
+ size 19940074
Experiments/run/events.out.tfevents.1706367627.edresson-train-80.45395.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa9dfc334b721e0bc90371cdf519b1bb244a637f4c56ce0ef949f76b5848ee8d
3
+ size 347255243
Experiments/run/events.out.tfevents.1706367849.edresson-train-80.46052.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90283b4420a0282755e4425857d26cf58a98b7229057b9e6a5aca19014168184
3
+ size 1238111
Experiments/run/events.out.tfevents.1706367954.edresson-train-80.46941.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fb96459ffb6c55b7e51e5ee3a13a2f265f2c1579a78756e789583906320e81e
3
+ size 350277161
Experiments/run/events.out.tfevents.1706446227.edresson-train-80.140666.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15518ce7cf33a4ff76601734d638436b43895ab61e6b4efe25b5a24b495f529c
3
+ size 21123264
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c62e29c7a1dd4f701ab4998e0b1f569cfe7486cc7806f149c1ff857f172383e0
3
+ size 1043220702
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model_124752.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c62e29c7a1dd4f701ab4998e0b1f569cfe7486cc7806f149c1ff857f172383e0
3
+ size 1043220702
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_130000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a71ead47e605fc525b264ad882fd54630c15a42eb69aaf88993d26d5ea84ae3b
3
+ size 1043220766
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_135000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96e16ee83729813041c17f6edf8a702bdf59e7afe345cfad1fe65dd4ba0b1fce
3
+ size 1043220766
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/config.json ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/raid/datasets/MUPE/Experiments/runs",
3
+ "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
4
+ "run_name": "YourTTS-Baseline-PT",
5
+ "project_name": "SYNTACC",
6
+ "run_description": "\n - YourTTS with SYNTACC text encoder\n ",
7
+ "print_step": 50,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "clearml",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": 1000,
14
+ "save_step": 5000,
15
+ "save_n_checkpoints": 2,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 10000,
19
+ "target_loss": null,
20
+ "print_eval": false,
21
+ "test_delay_epochs": 0,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 1000,
29
+ "batch_size": 26,
30
+ "eval_batch_size": 26,
31
+ "grad_clip": [
32
+ 1000,
33
+ 1000
34
+ ],
35
+ "scheduler_after_epoch": true,
36
+ "lr": 0.001,
37
+ "optimizer": "AdamW",
38
+ "optimizer_params": {
39
+ "betas": [
40
+ 0.8,
41
+ 0.99
42
+ ],
43
+ "eps": 1e-09,
44
+ "weight_decay": 0.01
45
+ },
46
+ "lr_scheduler": null,
47
+ "lr_scheduler_params": {},
48
+ "use_grad_scaler": false,
49
+ "allow_tf32": false,
50
+ "cudnn_enable": true,
51
+ "cudnn_deterministic": false,
52
+ "cudnn_benchmark": false,
53
+ "training_seed": 54321,
54
+ "model": "vits",
55
+ "num_loader_workers": 8,
56
+ "num_eval_loader_workers": 0,
57
+ "use_noise_augment": false,
58
+ "audio": {
59
+ "fft_size": 1024,
60
+ "sample_rate": 16000,
61
+ "win_length": 1024,
62
+ "hop_length": 256,
63
+ "num_mels": 80,
64
+ "mel_fmin": 0.0,
65
+ "mel_fmax": null
66
+ },
67
+ "use_phonemes": false,
68
+ "phonemizer": "espeak",
69
+ "phoneme_language": "en",
70
+ "compute_input_seq_cache": true,
71
+ "text_cleaner": "multilingual_cleaners",
72
+ "enable_eos_bos_chars": false,
73
+ "test_sentences_file": "",
74
+ "phoneme_cache_path": null,
75
+ "characters": {
76
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
77
+ "vocab_dict": null,
78
+ "pad": "_",
79
+ "eos": "&",
80
+ "bos": "*",
81
+ "blank": null,
82
+ "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
83
+ "punctuations": "\u2014!'(),-.:;?\u00bf ",
84
+ "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
85
+ "is_unique": true,
86
+ "is_sorted": true
87
+ },
88
+ "add_blank": true,
89
+ "batch_group_size": 48,
90
+ "loss_masking": null,
91
+ "min_audio_len": 1,
92
+ "max_audio_len": Infinity,
93
+ "min_text_len": 1,
94
+ "max_text_len": Infinity,
95
+ "compute_f0": false,
96
+ "compute_energy": false,
97
+ "compute_linear_spec": true,
98
+ "precompute_num_workers": 12,
99
+ "start_by_longest": true,
100
+ "shuffle": false,
101
+ "drop_last": false,
102
+ "datasets": [
103
+ {
104
+ "formatter": "coqui",
105
+ "dataset_name": "mupe",
106
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
107
+ "meta_file_train": "metadata_coqui_brpb.csv",
108
+ "ignored_speakers": null,
109
+ "language": "brpb",
110
+ "phonemizer": "",
111
+ "meta_file_val": "",
112
+ "meta_file_attn_mask": ""
113
+ },
114
+ {
115
+ "formatter": "coqui",
116
+ "dataset_name": "mupe",
117
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
118
+ "meta_file_train": "metadata_coqui_brba.csv",
119
+ "ignored_speakers": null,
120
+ "language": "brba",
121
+ "phonemizer": "",
122
+ "meta_file_val": "",
123
+ "meta_file_attn_mask": ""
124
+ },
125
+ {
126
+ "formatter": "coqui",
127
+ "dataset_name": "mupe",
128
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
129
+ "meta_file_train": "metadata_coqui_brportugal.csv",
130
+ "ignored_speakers": null,
131
+ "language": "brportugal",
132
+ "phonemizer": "",
133
+ "meta_file_val": "",
134
+ "meta_file_attn_mask": ""
135
+ },
136
+ {
137
+ "formatter": "coqui",
138
+ "dataset_name": "mupe",
139
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
140
+ "meta_file_train": "metadata_coqui_brsp.csv",
141
+ "ignored_speakers": null,
142
+ "language": "brsp",
143
+ "phonemizer": "",
144
+ "meta_file_val": "",
145
+ "meta_file_attn_mask": ""
146
+ },
147
+ {
148
+ "formatter": "coqui",
149
+ "dataset_name": "mupe",
150
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
151
+ "meta_file_train": "metadata_coqui_brpe.csv",
152
+ "ignored_speakers": null,
153
+ "language": "brpe",
154
+ "phonemizer": "",
155
+ "meta_file_val": "",
156
+ "meta_file_attn_mask": ""
157
+ },
158
+ {
159
+ "formatter": "coqui",
160
+ "dataset_name": "mupe",
161
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
162
+ "meta_file_train": "metadata_coqui_brmg.csv",
163
+ "ignored_speakers": null,
164
+ "language": "brmg",
165
+ "phonemizer": "",
166
+ "meta_file_val": "",
167
+ "meta_file_attn_mask": ""
168
+ },
169
+ {
170
+ "formatter": "coqui",
171
+ "dataset_name": "mupe",
172
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
173
+ "meta_file_train": "metadata_coqui_brrj.csv",
174
+ "ignored_speakers": null,
175
+ "language": "brrj",
176
+ "phonemizer": "",
177
+ "meta_file_val": "",
178
+ "meta_file_attn_mask": ""
179
+ },
180
+ {
181
+ "formatter": "coqui",
182
+ "dataset_name": "mupe",
183
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
184
+ "meta_file_train": "metadata_coqui_brce.csv",
185
+ "ignored_speakers": null,
186
+ "language": "brce",
187
+ "phonemizer": "",
188
+ "meta_file_val": "",
189
+ "meta_file_attn_mask": ""
190
+ },
191
+ {
192
+ "formatter": "coqui",
193
+ "dataset_name": "mupe",
194
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
195
+ "meta_file_train": "metadata_coqui_brrs.csv",
196
+ "ignored_speakers": null,
197
+ "language": "brrs",
198
+ "phonemizer": "",
199
+ "meta_file_val": "",
200
+ "meta_file_attn_mask": ""
201
+ },
202
+ {
203
+ "formatter": "coqui",
204
+ "dataset_name": "mupe",
205
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
206
+ "meta_file_train": "metadata_coqui_bralemanha.csv",
207
+ "ignored_speakers": null,
208
+ "language": "bralemanha",
209
+ "phonemizer": "",
210
+ "meta_file_val": "",
211
+ "meta_file_attn_mask": ""
212
+ },
213
+ {
214
+ "formatter": "coqui",
215
+ "dataset_name": "mupe",
216
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
217
+ "meta_file_train": "metadata_coqui_brgo.csv",
218
+ "ignored_speakers": null,
219
+ "language": "brgo",
220
+ "phonemizer": "",
221
+ "meta_file_val": "",
222
+ "meta_file_attn_mask": ""
223
+ },
224
+ {
225
+ "formatter": "coqui",
226
+ "dataset_name": "mupe",
227
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
228
+ "meta_file_train": "metadata_coqui_bral.csv",
229
+ "ignored_speakers": null,
230
+ "language": "bral",
231
+ "phonemizer": "",
232
+ "meta_file_val": "",
233
+ "meta_file_attn_mask": ""
234
+ },
235
+ {
236
+ "formatter": "coqui",
237
+ "dataset_name": "mupe",
238
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
239
+ "meta_file_train": "metadata_coqui_brpr.csv",
240
+ "ignored_speakers": null,
241
+ "language": "brpr",
242
+ "phonemizer": "",
243
+ "meta_file_val": "",
244
+ "meta_file_attn_mask": ""
245
+ }
246
+ ],
247
+ "test_sentences": [
248
+ [
249
+ "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
250
+ "EDILEINE_FONSECA",
251
+ null,
252
+ "brsp"
253
+ ],
254
+ [
255
+ "Quem semeia ventos, colhe tempestades.",
256
+ "JOSE_PAULO_DE_ARAUJO",
257
+ null,
258
+ "brpb"
259
+ ],
260
+ [
261
+ "O olho do dono \u00e9 que engorda o gado.",
262
+ "VITOR_RAFAEL_OLIVEIRA_ALVES",
263
+ null,
264
+ "brba"
265
+ ],
266
+ [
267
+ "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
268
+ "MARIA_AURORA_FELIX",
269
+ null,
270
+ "brportugal"
271
+ ],
272
+ [
273
+ "Quem espera sempre alcan\u00e7a.",
274
+ "ANTONIO_DE_AMORIM_COSTA",
275
+ null,
276
+ "brpe"
277
+ ],
278
+ [
279
+ "Cada macaco no seu galho.",
280
+ "ALCIDES_DE_LIMA",
281
+ null,
282
+ "brmg"
283
+ ],
284
+ [
285
+ "Em terra de cego, quem tem um olho \u00e9 rei.",
286
+ "ALUISIO_SOARES_DE_SOUSA",
287
+ null,
288
+ "brrj"
289
+ ],
290
+ [
291
+ "A ocasi\u00e3o faz o ladr\u00e3o.",
292
+ "FRANCISCO_JOSE_MOREIRA_MOTA",
293
+ null,
294
+ "brce"
295
+ ],
296
+ [
297
+ "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
298
+ "EVALDO_ANDRADA_CORREA",
299
+ null,
300
+ "brrs"
301
+ ],
302
+ [
303
+ "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
304
+ "DORIS_ALEXANDER",
305
+ null,
306
+ "bralemanha"
307
+ ],
308
+ [
309
+ "Quem n\u00e3o arrisca, n\u00e3o petisca.",
310
+ "DONALDO_LUIZ_DE_ALMEIDA",
311
+ null,
312
+ "brgo"
313
+ ],
314
+ [
315
+ "A uni\u00e3o faz a for\u00e7a.",
316
+ "GERONCIO_HENRIQUE_NETO",
317
+ null,
318
+ "bral"
319
+ ],
320
+ [
321
+ "Em boca fechada n\u00e3o entra mosquito.",
322
+ "MALU_NATEL_FREIRE_WEBER",
323
+ null,
324
+ "brpr"
325
+ ]
326
+ ],
327
+ "eval_split_max_size": 256,
328
+ "eval_split_size": 0.01,
329
+ "use_speaker_weighted_sampler": false,
330
+ "speaker_weighted_sampler_alpha": 1.0,
331
+ "use_language_weighted_sampler": false,
332
+ "language_weighted_sampler_alpha": 1.0,
333
+ "use_length_weighted_sampler": false,
334
+ "length_weighted_sampler_alpha": 1.0,
335
+ "model_args": {
336
+ "num_chars": 266,
337
+ "out_channels": 513,
338
+ "spec_segment_size": 62,
339
+ "hidden_channels": 192,
340
+ "use_adaptive_weight_text_encoder": false,
341
+ "use_perfect_class_batch_sampler": true,
342
+ "perfect_class_batch_sampler_key": "language",
343
+ "hidden_channels_ffn_text_encoder": 768,
344
+ "num_heads_text_encoder": 2,
345
+ "num_layers_text_encoder": 10,
346
+ "kernel_size_text_encoder": 3,
347
+ "dropout_p_text_encoder": 0.1,
348
+ "dropout_p_duration_predictor": 0.5,
349
+ "kernel_size_posterior_encoder": 5,
350
+ "dilation_rate_posterior_encoder": 1,
351
+ "num_layers_posterior_encoder": 16,
352
+ "kernel_size_flow": 5,
353
+ "dilation_rate_flow": 1,
354
+ "num_layers_flow": 4,
355
+ "resblock_type_decoder": "2",
356
+ "resblock_kernel_sizes_decoder": [
357
+ 3,
358
+ 7,
359
+ 11
360
+ ],
361
+ "resblock_dilation_sizes_decoder": [
362
+ [
363
+ 1,
364
+ 3,
365
+ 5
366
+ ],
367
+ [
368
+ 1,
369
+ 3,
370
+ 5
371
+ ],
372
+ [
373
+ 1,
374
+ 3,
375
+ 5
376
+ ]
377
+ ],
378
+ "upsample_rates_decoder": [
379
+ 8,
380
+ 8,
381
+ 2,
382
+ 2
383
+ ],
384
+ "upsample_initial_channel_decoder": 512,
385
+ "upsample_kernel_sizes_decoder": [
386
+ 16,
387
+ 16,
388
+ 4,
389
+ 4
390
+ ],
391
+ "periods_multi_period_discriminator": [
392
+ 2,
393
+ 3,
394
+ 5,
395
+ 7,
396
+ 11
397
+ ],
398
+ "use_sdp": true,
399
+ "noise_scale": 1.0,
400
+ "inference_noise_scale": 0.667,
401
+ "length_scale": 1,
402
+ "noise_scale_dp": 1.0,
403
+ "inference_noise_scale_dp": 1.0,
404
+ "max_inference_len": null,
405
+ "init_discriminator": true,
406
+ "use_spectral_norm_disriminator": false,
407
+ "use_speaker_embedding": false,
408
+ "num_speakers": 0,
409
+ "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth",
410
+ "d_vector_file": [
411
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
412
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
413
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
414
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
415
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
416
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
417
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
418
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
419
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
420
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
421
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
422
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
423
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
424
+ ],
425
+ "speaker_embedding_channels": 256,
426
+ "use_d_vector_file": true,
427
+ "d_vector_dim": 512,
428
+ "detach_dp_input": true,
429
+ "use_language_embedding": true,
430
+ "embedded_language_dim": 4,
431
+ "num_languages": 0,
432
+ "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json",
433
+ "use_speaker_encoder_as_loss": false,
434
+ "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
435
+ "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
436
+ "condition_dp_on_speaker": true,
437
+ "freeze_encoder": false,
438
+ "freeze_DP": false,
439
+ "freeze_PE": false,
440
+ "freeze_flow_decoder": false,
441
+ "freeze_waveform_decoder": false,
442
+ "encoder_sample_rate": null,
443
+ "interpolate_z": true,
444
+ "reinit_DP": false,
445
+ "reinit_text_encoder": false
446
+ },
447
+ "lr_gen": 0.0002,
448
+ "lr_disc": 0.0002,
449
+ "lr_scheduler_gen": "ExponentialLR",
450
+ "lr_scheduler_gen_params": {
451
+ "gamma": 0.999875,
452
+ "last_epoch": -1
453
+ },
454
+ "lr_scheduler_disc": "ExponentialLR",
455
+ "lr_scheduler_disc_params": {
456
+ "gamma": 0.999875,
457
+ "last_epoch": -1
458
+ },
459
+ "kl_loss_alpha": 1.0,
460
+ "disc_loss_alpha": 1.0,
461
+ "gen_loss_alpha": 1.0,
462
+ "feat_loss_alpha": 1.0,
463
+ "mel_loss_alpha": 45.0,
464
+ "dur_loss_alpha": 1.0,
465
+ "speaker_encoder_loss_alpha": 9.0,
466
+ "return_wav": true,
467
+ "use_weighted_sampler": true,
468
+ "weighted_sampler_attrs": {
469
+ "language": 1.0
470
+ },
471
+ "weighted_sampler_multipliers": {},
472
+ "r": 1,
473
+ "num_speakers": 0,
474
+ "use_speaker_embedding": false,
475
+ "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth",
476
+ "speaker_embedding_channels": 256,
477
+ "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json",
478
+ "use_language_embedding": true,
479
+ "use_d_vector_file": true,
480
+ "d_vector_file": [
481
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
482
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
483
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
484
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
485
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
486
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
487
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
488
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
489
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
490
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
491
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
492
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
493
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
494
+ ],
495
+ "d_vector_dim": 512
496
+ }
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bral": 0,
3
+ "bralemanha": 1,
4
+ "brba": 2,
5
+ "brce": 3,
6
+ "brgo": 4,
7
+ "brmg": 5,
8
+ "brpb": 6,
9
+ "brpe": 7,
10
+ "brportugal": 8,
11
+ "brpr": 9,
12
+ "brrj": 10,
13
+ "brrs": 11,
14
+ "brsp": 12
15
+ }
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
3
+ size 3296
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/train_syntacc_baseline.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.bin.compute_embeddings import compute_embeddings
7
+ from TTS.bin.resample import resample_files
8
+ from TTS.config.shared_configs import BaseDatasetConfig
9
+ from TTS.tts.configs.vits_config import VitsConfig
10
+ from TTS.tts.datasets import load_tts_samples
11
+ from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
12
+ from TTS.utils.downloaders import download_libri_tts
13
+ from torch.utils.data import DataLoader
14
+ from TTS.utils.samplers import PerfectBatchSampler
15
+ torch.set_num_threads(24)
16
+
17
+ # pylint: disable=W0105
18
+ """
19
+ This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
20
+ YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
21
+ """
22
+ CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
23
+
24
+ # Name of the run for the Trainer
25
+ RUN_NAME = "YourTTS-Baseline-PT"
26
+
27
+ # Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
28
+ OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
29
+
30
+ # If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
31
+ RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
32
+
33
+ # This parameter is useful for debugging: it skips the training epochs and only runs the evaluation and produces the test sentences
34
+ SKIP_TRAIN_EPOCH = False
35
+
36
+ # Set here the batch size to be used in training and evaluation
37
+ BATCH_SIZE = 26
38
+
39
+ # Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
40
+ # Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
41
+ SAMPLE_RATE = 16000
42
+
43
+
44
+ DASHBOARD_LOGGER="tensorboard"
45
+ LOGGER_URI = None
46
+
47
+ DASHBOARD_LOGGER = "clearml"
48
+ LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
49
+
50
+
51
+
52
+ # Max audio length in seconds to be used in training (every audio longer than this will be ignored)
53
+ MAX_AUDIO_LEN_IN_SECONDS = float("inf")
54
+
55
+ # Define here the datasets config
56
+ brpb_train_config = BaseDatasetConfig(
57
+ formatter="coqui",
58
+ dataset_name="mupe",
59
+ meta_file_train="metadata_coqui_brpb.csv",
60
+ path="/raid/datasets/MUPE/dataset/mupe/",
61
+ language="brpb"
62
+ )
63
+
64
+ brba_train_config = BaseDatasetConfig(
65
+ formatter="coqui",
66
+ dataset_name="mupe",
67
+ meta_file_train="metadata_coqui_brba.csv",
68
+ path="/raid/datasets/MUPE/dataset/mupe/",
69
+ language="brba"
70
+ )
71
+
72
+ brportugal_train_config = BaseDatasetConfig(
73
+ formatter="coqui",
74
+ dataset_name="mupe",
75
+ meta_file_train="metadata_coqui_brportugal.csv",
76
+ path="/raid/datasets/MUPE/dataset/mupe/",
77
+ language="brportugal"
78
+ )
79
+
80
+ brsp_train_config = BaseDatasetConfig(
81
+ formatter="coqui",
82
+ dataset_name="mupe",
83
+ meta_file_train="metadata_coqui_brsp.csv",
84
+ path="/raid/datasets/MUPE/dataset/mupe/",
85
+ language="brsp"
86
+ )
87
+
88
+ brpe_train_config = BaseDatasetConfig(
89
+ formatter="coqui",
90
+ dataset_name="mupe",
91
+ meta_file_train="metadata_coqui_brpe.csv",
92
+ path="/raid/datasets/MUPE/dataset/mupe/",
93
+ language="brpe"
94
+ )
95
+
96
+ brmg_train_config = BaseDatasetConfig(
97
+ formatter="coqui",
98
+ dataset_name="mupe",
99
+ meta_file_train="metadata_coqui_brmg.csv",
100
+ path="/raid/datasets/MUPE/dataset/mupe/",
101
+ language="brmg"
102
+ )
103
+
104
+ brrj_train_config = BaseDatasetConfig(
105
+ formatter="coqui",
106
+ dataset_name="mupe",
107
+ meta_file_train="metadata_coqui_brrj.csv",
108
+ path="/raid/datasets/MUPE/dataset/mupe/",
109
+ language="brrj"
110
+ )
111
+
112
+ brce_train_config = BaseDatasetConfig(
113
+ formatter="coqui",
114
+ dataset_name="mupe",
115
+ meta_file_train="metadata_coqui_brce.csv",
116
+ path="/raid/datasets/MUPE/dataset/mupe/",
117
+ language="brce"
118
+ )
119
+
120
+ brrs_train_config = BaseDatasetConfig(
121
+ formatter="coqui",
122
+ dataset_name="mupe",
123
+ meta_file_train="metadata_coqui_brrs.csv",
124
+ path="/raid/datasets/MUPE/dataset/mupe/",
125
+ language="brrs"
126
+ )
127
+
128
+ bralemanha_train_config = BaseDatasetConfig(
129
+ formatter="coqui",
130
+ dataset_name="mupe",
131
+ meta_file_train="metadata_coqui_bralemanha.csv",
132
+ path="/raid/datasets/MUPE/dataset/mupe/",
133
+ language="bralemanha"
134
+ )
135
+
136
+ brgo_train_config = BaseDatasetConfig(
137
+ formatter="coqui",
138
+ dataset_name="mupe",
139
+ meta_file_train="metadata_coqui_brgo.csv",
140
+ path="/raid/datasets/MUPE/dataset/mupe/",
141
+ language="brgo"
142
+ )
143
+
144
+ bral_train_config = BaseDatasetConfig(
145
+ formatter="coqui",
146
+ dataset_name="mupe",
147
+ meta_file_train="metadata_coqui_bral.csv",
148
+ path="/raid/datasets/MUPE/dataset/mupe/",
149
+ language="bral"
150
+ )
151
+
152
+ brpr_train_config = BaseDatasetConfig(
153
+ formatter="coqui",
154
+ dataset_name="mupe",
155
+ meta_file_train="metadata_coqui_brpr.csv",
156
+ path="/raid/datasets/MUPE/dataset/mupe/",
157
+ language="brpr"
158
+ )
159
+
160
+ bres_train_config = BaseDatasetConfig(
161
+ formatter="coqui",
162
+ dataset_name="mupe",
163
+ meta_file_train="metadata_coqui_bres.csv",
164
+ path="/raid/datasets/MUPE/dataset/mupe/",
165
+ language="bres"
166
+ )
167
+
168
+ brpi_train_config = BaseDatasetConfig(
169
+ formatter="coqui",
170
+ dataset_name="mupe",
171
+ meta_file_train="metadata_coqui_brpi.csv",
172
+ path="/raid/datasets/MUPE/dataset/mupe/",
173
+ language="brpi"
174
+ )
175
+
176
+ # bres_train_config and brpi_train_config are left out of the list below because no files were found for them
177
+ DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
178
+
179
+
180
+ ### Extract speaker embeddings
181
+ SPEAKER_ENCODER_CHECKPOINT_PATH = (
182
+ "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
183
+ )
184
+ SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
185
+
186
+ D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
187
+
188
+ # Iterate over all the dataset configs, checking whether the speaker embeddings have already been computed; if not, compute them
189
+ for dataset_conf in DATASETS_CONFIG_LIST:
190
+ # Compute the embeddings only if the embeddings file does not exist yet
191
+ embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
192
+ if not os.path.isfile(embeddings_file):
193
+ print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
194
+ compute_embeddings(
195
+ SPEAKER_ENCODER_CHECKPOINT_PATH,
196
+ SPEAKER_ENCODER_CONFIG_PATH,
197
+ embeddings_file,
198
+ old_speakers_file=None,
199
+ config_dataset_path=None,
200
+ formatter_name=dataset_conf.formatter,
201
+ dataset_name=dataset_conf.dataset_name,
202
+ dataset_path=dataset_conf.path,
203
+ meta_file_train=dataset_conf.meta_file_train,
204
+ meta_file_val=dataset_conf.meta_file_val,
205
+ disable_cuda=False,
206
+ no_eval=False,
207
+ )
208
+ D_VECTOR_FILES.append(embeddings_file)
209
+
210
+
211
+ # Audio config used in training.
212
+ audio_config = VitsAudioConfig(
213
+ sample_rate=SAMPLE_RATE,
214
+ hop_length=256,
215
+ win_length=1024,
216
+ fft_size=1024,
217
+ mel_fmin=0.0,
218
+ mel_fmax=None,
219
+ num_mels=80,
220
+ )
221
+
222
+ # Init VITSArgs setting the arguments that are needed for the YourTTS model
223
+ model_args = VitsArgs(
224
+ spec_segment_size=62,
225
+ hidden_channels=192,
226
+ hidden_channels_ffn_text_encoder=768,
227
+ num_heads_text_encoder=2,
228
+ num_layers_text_encoder=10,
229
+ kernel_size_text_encoder=3,
230
+ dropout_p_text_encoder=0.1,
231
+ d_vector_file=D_VECTOR_FILES,
232
+ use_d_vector_file=True,
233
+ d_vector_dim=512,
234
+ speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
235
+ speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
236
+ resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
237
+ # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
238
+ use_speaker_encoder_as_loss=False,
239
+ # Useful parameters to enable multilingual training
240
+ use_language_embedding=True,
241
+ embedded_language_dim=4,
242
+ use_adaptive_weight_text_encoder=False,
243
+ use_perfect_class_batch_sampler=True,
244
+ perfect_class_batch_sampler_key="language"
245
+ )
246
+
247
+ # General training config, here you can change the batch size and others useful parameters
248
+ config = VitsConfig(
249
+ output_path=OUT_PATH,
250
+ model_args=model_args,
251
+ run_name=RUN_NAME,
252
+ project_name="SYNTACC",
253
+ run_description="""
254
+ - YourTTS with SYNTACC text encoder
255
+ """,
256
+ dashboard_logger=DASHBOARD_LOGGER,
257
+ logger_uri=LOGGER_URI,
258
+ audio=audio_config,
259
+ batch_size=BATCH_SIZE,
260
+ batch_group_size=48,
261
+ eval_batch_size=BATCH_SIZE,
262
+ num_loader_workers=8,
263
+ eval_split_max_size=256,
264
+ print_step=50,
265
+ plot_step=100,
266
+ log_model_step=1000,
267
+ save_step=5000,
268
+ save_n_checkpoints=2,
269
+ save_checkpoints=True,
270
+ # target_loss="loss_1",
271
+ print_eval=False,
272
+ use_phonemes=False,
273
+ phonemizer="espeak",
274
+ phoneme_language="en",
275
+ compute_input_seq_cache=True,
276
+ add_blank=True,
277
+ text_cleaner="multilingual_cleaners",
278
+ characters=CharactersConfig(
279
+ characters_class="TTS.tts.models.vits.VitsCharacters",
280
+ pad="_",
281
+ eos="&",
282
+ bos="*",
283
+ blank=None,
284
+ characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
285
+ punctuations="\u2014!'(),-.:;?\u00bf ",
286
+ phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
287
+ is_unique=True,
288
+ is_sorted=True,
289
+ ),
290
+ phoneme_cache_path=None,
291
+ precompute_num_workers=12,
292
+ start_by_longest=True,
293
+ datasets=DATASETS_CONFIG_LIST,
294
+ cudnn_benchmark=False,
295
+ max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
296
+ mixed_precision=False,
297
+ test_sentences=[
298
+ # GUSTAVO: only speakers from the training set
299
+ ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
300
+ ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
301
+ ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
302
+ ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
303
+ ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
304
+ ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
305
+ ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
306
+ ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
307
+ ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
308
+ ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
309
+ ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
310
+ ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
311
+ ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
312
+ # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
313
+ # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
314
+ ],
315
+ # Enable the weighted sampler
316
+ use_weighted_sampler=True,
317
+ # Balances the training batches across languages no matter how many samples each language has; the commented-out variant below would also balance across speakers
318
+ # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
319
+ weighted_sampler_attrs={"language": 1.0},
320
+ weighted_sampler_multipliers={
321
+ # "speaker_name": {
322
+ # you can force the batching scheme to give a higher weight to a certain speaker, and then this speaker will appear more frequently in the batch.
323
+ # It will speed up the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
324
+ # The line below will make the balancer consider the "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers present in the CML dataset.
325
+ # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
326
+ # }
327
+ },
328
+ # It defines the Speaker Consistency Loss (SCL) α to 9 like the YourTTS paper
329
+ speaker_encoder_loss_alpha=9.0,
330
+ )
331
+
332
+ # Load all the dataset samples and split them into training and evaluation sets
333
+ train_samples, eval_samples = load_tts_samples(
334
+ config.datasets,
335
+ eval_split=True,
336
+ eval_split_max_size=config.eval_split_max_size,
337
+ eval_split_size=config.eval_split_size,
338
+ )
339
+
340
+ # Init the model
341
+ model = Vits.init_from_config(config)
342
+
343
+ # Init the trainer and 🚀
344
+ trainer = Trainer(
345
+ TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
346
+ config,
347
+ output_path=OUT_PATH,
348
+ model=model,
349
+ train_samples=train_samples,
350
+ eval_samples=eval_samples,
351
+ )
352
+ trainer.fit()
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/trainer_0_log.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9eb020abfc0ef9798a6097596138d1567d58429ca6c2ce6e59b350acc5301cff
3
+ size 1771305
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a4a050e0d7a9c6c302b70b3f59dc195b12ad8922988de81bae55cbc1a89b9c8
3
+ size 347719275
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model_85001.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a4a050e0d7a9c6c302b70b3f59dc195b12ad8922988de81bae55cbc1a89b9c8
3
+ size 347719275
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/raid/datasets/MUPE/Experiments/runs",
3
+ "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
4
+ "run_name": "YourTTS-Baseline-PT",
5
+ "project_name": "SYNTACC",
6
+ "run_description": "\n - YourTTS with SYNTACC text encoder\n ",
7
+ "print_step": 50,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "clearml",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": 1000,
14
+ "save_step": 5000,
15
+ "save_n_checkpoints": 2,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 10000,
19
+ "target_loss": null,
20
+ "print_eval": false,
21
+ "test_delay_epochs": 0,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 1000,
29
+ "batch_size": 26,
30
+ "eval_batch_size": 26,
31
+ "grad_clip": [
32
+ 1000,
33
+ 1000
34
+ ],
35
+ "scheduler_after_epoch": true,
36
+ "lr": 0.001,
37
+ "optimizer": "AdamW",
38
+ "optimizer_params": {
39
+ "betas": [
40
+ 0.8,
41
+ 0.99
42
+ ],
43
+ "eps": 1e-09,
44
+ "weight_decay": 0.01
45
+ },
46
+ "lr_scheduler": null,
47
+ "lr_scheduler_params": {},
48
+ "use_grad_scaler": false,
49
+ "allow_tf32": false,
50
+ "cudnn_enable": true,
51
+ "cudnn_deterministic": false,
52
+ "cudnn_benchmark": false,
53
+ "training_seed": 54321,
54
+ "model": "vits",
55
+ "num_loader_workers": 8,
56
+ "num_eval_loader_workers": 0,
57
+ "use_noise_augment": false,
58
+ "audio": {
59
+ "fft_size": 1024,
60
+ "sample_rate": 16000,
61
+ "win_length": 1024,
62
+ "hop_length": 256,
63
+ "num_mels": 80,
64
+ "mel_fmin": 0.0,
65
+ "mel_fmax": null
66
+ },
67
+ "use_phonemes": false,
68
+ "phonemizer": "espeak",
69
+ "phoneme_language": "en",
70
+ "compute_input_seq_cache": true,
71
+ "text_cleaner": "multilingual_cleaners",
72
+ "enable_eos_bos_chars": false,
73
+ "test_sentences_file": "",
74
+ "phoneme_cache_path": null,
75
+ "characters": {
76
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
77
+ "vocab_dict": null,
78
+ "pad": "_",
79
+ "eos": "&",
80
+ "bos": "*",
81
+ "blank": null,
82
+ "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
83
+ "punctuations": "\u2014!'(),-.:;?\u00bf ",
84
+ "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
85
+ "is_unique": true,
86
+ "is_sorted": true
87
+ },
88
+ "add_blank": true,
89
+ "batch_group_size": 48,
90
+ "loss_masking": null,
91
+ "min_audio_len": 1,
92
+ "max_audio_len": Infinity,
93
+ "min_text_len": 1,
94
+ "max_text_len": Infinity,
95
+ "compute_f0": false,
96
+ "compute_energy": false,
97
+ "compute_linear_spec": true,
98
+ "precompute_num_workers": 12,
99
+ "start_by_longest": true,
100
+ "shuffle": false,
101
+ "drop_last": false,
102
+ "datasets": [
103
+ {
104
+ "formatter": "coqui",
105
+ "dataset_name": "mupe",
106
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
107
+ "meta_file_train": "metadata_coqui_brpb.csv",
108
+ "ignored_speakers": null,
109
+ "language": "brpb",
110
+ "phonemizer": "",
111
+ "meta_file_val": "",
112
+ "meta_file_attn_mask": ""
113
+ },
114
+ {
115
+ "formatter": "coqui",
116
+ "dataset_name": "mupe",
117
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
118
+ "meta_file_train": "metadata_coqui_brba.csv",
119
+ "ignored_speakers": null,
120
+ "language": "brba",
121
+ "phonemizer": "",
122
+ "meta_file_val": "",
123
+ "meta_file_attn_mask": ""
124
+ },
125
+ {
126
+ "formatter": "coqui",
127
+ "dataset_name": "mupe",
128
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
129
+ "meta_file_train": "metadata_coqui_brportugal.csv",
130
+ "ignored_speakers": null,
131
+ "language": "brportugal",
132
+ "phonemizer": "",
133
+ "meta_file_val": "",
134
+ "meta_file_attn_mask": ""
135
+ },
136
+ {
137
+ "formatter": "coqui",
138
+ "dataset_name": "mupe",
139
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
140
+ "meta_file_train": "metadata_coqui_brsp.csv",
141
+ "ignored_speakers": null,
142
+ "language": "brsp",
143
+ "phonemizer": "",
144
+ "meta_file_val": "",
145
+ "meta_file_attn_mask": ""
146
+ },
147
+ {
148
+ "formatter": "coqui",
149
+ "dataset_name": "mupe",
150
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
151
+ "meta_file_train": "metadata_coqui_brpe.csv",
152
+ "ignored_speakers": null,
153
+ "language": "brpe",
154
+ "phonemizer": "",
155
+ "meta_file_val": "",
156
+ "meta_file_attn_mask": ""
157
+ },
158
+ {
159
+ "formatter": "coqui",
160
+ "dataset_name": "mupe",
161
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
162
+ "meta_file_train": "metadata_coqui_brmg.csv",
163
+ "ignored_speakers": null,
164
+ "language": "brmg",
165
+ "phonemizer": "",
166
+ "meta_file_val": "",
167
+ "meta_file_attn_mask": ""
168
+ },
169
+ {
170
+ "formatter": "coqui",
171
+ "dataset_name": "mupe",
172
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
173
+ "meta_file_train": "metadata_coqui_brrj.csv",
174
+ "ignored_speakers": null,
175
+ "language": "brrj",
176
+ "phonemizer": "",
177
+ "meta_file_val": "",
178
+ "meta_file_attn_mask": ""
179
+ },
180
+ {
181
+ "formatter": "coqui",
182
+ "dataset_name": "mupe",
183
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
184
+ "meta_file_train": "metadata_coqui_brce.csv",
185
+ "ignored_speakers": null,
186
+ "language": "brce",
187
+ "phonemizer": "",
188
+ "meta_file_val": "",
189
+ "meta_file_attn_mask": ""
190
+ },
191
+ {
192
+ "formatter": "coqui",
193
+ "dataset_name": "mupe",
194
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
195
+ "meta_file_train": "metadata_coqui_brrs.csv",
196
+ "ignored_speakers": null,
197
+ "language": "brrs",
198
+ "phonemizer": "",
199
+ "meta_file_val": "",
200
+ "meta_file_attn_mask": ""
201
+ },
202
+ {
203
+ "formatter": "coqui",
204
+ "dataset_name": "mupe",
205
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
206
+ "meta_file_train": "metadata_coqui_bralemanha.csv",
207
+ "ignored_speakers": null,
208
+ "language": "bralemanha",
209
+ "phonemizer": "",
210
+ "meta_file_val": "",
211
+ "meta_file_attn_mask": ""
212
+ },
213
+ {
214
+ "formatter": "coqui",
215
+ "dataset_name": "mupe",
216
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
217
+ "meta_file_train": "metadata_coqui_brgo.csv",
218
+ "ignored_speakers": null,
219
+ "language": "brgo",
220
+ "phonemizer": "",
221
+ "meta_file_val": "",
222
+ "meta_file_attn_mask": ""
223
+ },
224
+ {
225
+ "formatter": "coqui",
226
+ "dataset_name": "mupe",
227
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
228
+ "meta_file_train": "metadata_coqui_bral.csv",
229
+ "ignored_speakers": null,
230
+ "language": "bral",
231
+ "phonemizer": "",
232
+ "meta_file_val": "",
233
+ "meta_file_attn_mask": ""
234
+ },
235
+ {
236
+ "formatter": "coqui",
237
+ "dataset_name": "mupe",
238
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
239
+ "meta_file_train": "metadata_coqui_brpr.csv",
240
+ "ignored_speakers": null,
241
+ "language": "brpr",
242
+ "phonemizer": "",
243
+ "meta_file_val": "",
244
+ "meta_file_attn_mask": ""
245
+ }
246
+ ],
247
+ "test_sentences": [
248
+ [
249
+ "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
250
+ "EDILEINE_FONSECA",
251
+ null,
252
+ "brsp"
253
+ ],
254
+ [
255
+ "Quem semeia ventos, colhe tempestades.",
256
+ "JOSE_PAULO_DE_ARAUJO",
257
+ null,
258
+ "brpb"
259
+ ],
260
+ [
261
+ "O olho do dono \u00e9 que engorda o gado.",
262
+ "VITOR_RAFAEL_OLIVEIRA_ALVES",
263
+ null,
264
+ "brba"
265
+ ],
266
+ [
267
+ "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
268
+ "MARIA_AURORA_FELIX",
269
+ null,
270
+ "brportugal"
271
+ ],
272
+ [
273
+ "Quem espera sempre alcan\u00e7a.",
274
+ "ANTONIO_DE_AMORIM_COSTA",
275
+ null,
276
+ "brpe"
277
+ ],
278
+ [
279
+ "Cada macaco no seu galho.",
280
+ "ALCIDES_DE_LIMA",
281
+ null,
282
+ "brmg"
283
+ ],
284
+ [
285
+ "Em terra de cego, quem tem um olho \u00e9 rei.",
286
+ "ALUISIO_SOARES_DE_SOUSA",
287
+ null,
288
+ "brrj"
289
+ ],
290
+ [
291
+ "A ocasi\u00e3o faz o ladr\u00e3o.",
292
+ "FRANCISCO_JOSE_MOREIRA_MOTA",
293
+ null,
294
+ "brce"
295
+ ],
296
+ [
297
+ "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
298
+ "EVALDO_ANDRADA_CORREA",
299
+ null,
300
+ "brrs"
301
+ ],
302
+ [
303
+ "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
304
+ "DORIS_ALEXANDER",
305
+ null,
306
+ "bralemanha"
307
+ ],
308
+ [
309
+ "Quem n\u00e3o arrisca, n\u00e3o petisca.",
310
+ "DONALDO_LUIZ_DE_ALMEIDA",
311
+ null,
312
+ "brgo"
313
+ ],
314
+ [
315
+ "A uni\u00e3o faz a for\u00e7a.",
316
+ "GERONCIO_HENRIQUE_NETO",
317
+ null,
318
+ "bral"
319
+ ],
320
+ [
321
+ "Em boca fechada n\u00e3o entra mosquito.",
322
+ "MALU_NATEL_FREIRE_WEBER",
323
+ null,
324
+ "brpr"
325
+ ]
326
+ ],
327
+ "eval_split_max_size": 256,
328
+ "eval_split_size": 0.01,
329
+ "use_speaker_weighted_sampler": false,
330
+ "speaker_weighted_sampler_alpha": 1.0,
331
+ "use_language_weighted_sampler": false,
332
+ "language_weighted_sampler_alpha": 1.0,
333
+ "use_length_weighted_sampler": false,
334
+ "length_weighted_sampler_alpha": 1.0,
335
+ "model_args": {
336
+ "num_chars": 266,
337
+ "out_channels": 513,
338
+ "spec_segment_size": 62,
339
+ "hidden_channels": 192,
340
+ "use_adaptive_weight_text_encoder": false,
341
+ "use_perfect_class_batch_sampler": true,
342
+ "perfect_class_batch_sampler_key": "language",
343
+ "hidden_channels_ffn_text_encoder": 768,
344
+ "num_heads_text_encoder": 2,
345
+ "num_layers_text_encoder": 10,
346
+ "kernel_size_text_encoder": 3,
347
+ "dropout_p_text_encoder": 0.1,
348
+ "dropout_p_duration_predictor": 0.5,
349
+ "kernel_size_posterior_encoder": 5,
350
+ "dilation_rate_posterior_encoder": 1,
351
+ "num_layers_posterior_encoder": 16,
352
+ "kernel_size_flow": 5,
353
+ "dilation_rate_flow": 1,
354
+ "num_layers_flow": 4,
355
+ "resblock_type_decoder": "2",
356
+ "resblock_kernel_sizes_decoder": [
357
+ 3,
358
+ 7,
359
+ 11
360
+ ],
361
+ "resblock_dilation_sizes_decoder": [
362
+ [
363
+ 1,
364
+ 3,
365
+ 5
366
+ ],
367
+ [
368
+ 1,
369
+ 3,
370
+ 5
371
+ ],
372
+ [
373
+ 1,
374
+ 3,
375
+ 5
376
+ ]
377
+ ],
378
+ "upsample_rates_decoder": [
379
+ 8,
380
+ 8,
381
+ 2,
382
+ 2
383
+ ],
384
+ "upsample_initial_channel_decoder": 512,
385
+ "upsample_kernel_sizes_decoder": [
386
+ 16,
387
+ 16,
388
+ 4,
389
+ 4
390
+ ],
391
+ "periods_multi_period_discriminator": [
392
+ 2,
393
+ 3,
394
+ 5,
395
+ 7,
396
+ 11
397
+ ],
398
+ "use_sdp": true,
399
+ "noise_scale": 1.0,
400
+ "inference_noise_scale": 0.667,
401
+ "length_scale": 1,
402
+ "noise_scale_dp": 1.0,
403
+ "inference_noise_scale_dp": 1.0,
404
+ "max_inference_len": null,
405
+ "init_discriminator": true,
406
+ "use_spectral_norm_disriminator": false,
407
+ "use_speaker_embedding": false,
408
+ "num_speakers": 0,
409
+ "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth",
410
+ "d_vector_file": [
411
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
412
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
413
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
414
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
415
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
416
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
417
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
418
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
419
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
420
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
421
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
422
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
423
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
424
+ ],
425
+ "speaker_embedding_channels": 256,
426
+ "use_d_vector_file": true,
427
+ "d_vector_dim": 512,
428
+ "detach_dp_input": true,
429
+ "use_language_embedding": true,
430
+ "embedded_language_dim": 4,
431
+ "num_languages": 0,
432
+ "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json",
433
+ "use_speaker_encoder_as_loss": false,
434
+ "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
435
+ "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
436
+ "condition_dp_on_speaker": true,
437
+ "freeze_encoder": false,
438
+ "freeze_DP": false,
439
+ "freeze_PE": false,
440
+ "freeze_flow_decoder": false,
441
+ "freeze_waveform_decoder": false,
442
+ "encoder_sample_rate": null,
443
+ "interpolate_z": true,
444
+ "reinit_DP": false,
445
+ "reinit_text_encoder": false
446
+ },
447
+ "lr_gen": 0.0002,
448
+ "lr_disc": 0.0002,
449
+ "lr_scheduler_gen": "ExponentialLR",
450
+ "lr_scheduler_gen_params": {
451
+ "gamma": 0.999875,
452
+ "last_epoch": -1
453
+ },
454
+ "lr_scheduler_disc": "ExponentialLR",
455
+ "lr_scheduler_disc_params": {
456
+ "gamma": 0.999875,
457
+ "last_epoch": -1
458
+ },
459
+ "kl_loss_alpha": 1.0,
460
+ "disc_loss_alpha": 1.0,
461
+ "gen_loss_alpha": 1.0,
462
+ "feat_loss_alpha": 1.0,
463
+ "mel_loss_alpha": 45.0,
464
+ "dur_loss_alpha": 1.0,
465
+ "speaker_encoder_loss_alpha": 9.0,
466
+ "return_wav": true,
467
+ "use_weighted_sampler": true,
468
+ "weighted_sampler_attrs": {
469
+ "language": 1.0
470
+ },
471
+ "weighted_sampler_multipliers": {},
472
+ "r": 1,
473
+ "num_speakers": 0,
474
+ "use_speaker_embedding": false,
475
+ "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth",
476
+ "speaker_embedding_channels": 256,
477
+ "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json",
478
+ "use_language_embedding": true,
479
+ "use_d_vector_file": true,
480
+ "d_vector_file": [
481
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
482
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
483
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
484
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
485
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
486
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
487
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
488
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
489
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
490
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
491
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
492
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
493
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
494
+ ],
495
+ "d_vector_dim": 512
496
+ }
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bral": 0,
3
+ "bralemanha": 1,
4
+ "brba": 2,
5
+ "brce": 3,
6
+ "brgo": 4,
7
+ "brmg": 5,
8
+ "brpb": 6,
9
+ "brpe": 7,
10
+ "brportugal": 8,
11
+ "brpr": 9,
12
+ "brrj": 10,
13
+ "brrs": 11,
14
+ "brsp": 12
15
+ }
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
3
+ size 3296
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/train_syntacc_baseline.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.bin.compute_embeddings import compute_embeddings
7
+ from TTS.bin.resample import resample_files
8
+ from TTS.config.shared_configs import BaseDatasetConfig
9
+ from TTS.tts.configs.vits_config import VitsConfig
10
+ from TTS.tts.datasets import load_tts_samples
11
+ from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
12
+ from TTS.utils.downloaders import download_libri_tts
13
+ from torch.utils.data import DataLoader
14
+ from TTS.utils.samplers import PerfectBatchSampler
15
+ torch.set_num_threads(24)
16
+
17
+ # pylint: disable=W0105
18
+ """
19
+ This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
20
+ YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
21
+ """
22
+ CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
23
+
24
+ # Name of the run for the Trainer
25
+ RUN_NAME = "YourTTS-Baseline-PT"
26
+
27
+ # Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
28
+ OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
29
+
30
+ # If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
31
+ RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
32
+
33
+ # This parameter is useful for debugging: it skips the training epochs and only runs the evaluation and produces the test sentences
34
+ SKIP_TRAIN_EPOCH = False
35
+
36
+ # Set here the batch size to be used in training and evaluation
37
+ BATCH_SIZE = 26
38
+
39
+ # Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
40
+ # Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
41
+ SAMPLE_RATE = 16000
42
+
43
+
44
+ DASHBOARD_LOGGER="tensorboard"
45
+ LOGGER_URI = None
46
+
47
+ DASHBOARD_LOGGER = "clearml"
48
+ LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
49
+
50
+
51
+
52
+ # Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
53
+ MAX_AUDIO_LEN_IN_SECONDS = float("inf")
54
+
55
+ # Define here the datasets config
56
+ brpb_train_config = BaseDatasetConfig(
57
+ formatter="coqui",
58
+ dataset_name="mupe",
59
+ meta_file_train="metadata_coqui_brpb.csv",
60
+ path="/raid/datasets/MUPE/dataset/mupe/",
61
+ language="brpb"
62
+ )
63
+
64
+ brba_train_config = BaseDatasetConfig(
65
+ formatter="coqui",
66
+ dataset_name="mupe",
67
+ meta_file_train="metadata_coqui_brba.csv",
68
+ path="/raid/datasets/MUPE/dataset/mupe/",
69
+ language="brba"
70
+ )
71
+
72
+ brportugal_train_config = BaseDatasetConfig(
73
+ formatter="coqui",
74
+ dataset_name="mupe",
75
+ meta_file_train="metadata_coqui_brportugal.csv",
76
+ path="/raid/datasets/MUPE/dataset/mupe/",
77
+ language="brportugal"
78
+ )
79
+
80
+ brsp_train_config = BaseDatasetConfig(
81
+ formatter="coqui",
82
+ dataset_name="mupe",
83
+ meta_file_train="metadata_coqui_brsp.csv",
84
+ path="/raid/datasets/MUPE/dataset/mupe/",
85
+ language="brsp"
86
+ )
87
+
88
+ brpe_train_config = BaseDatasetConfig(
89
+ formatter="coqui",
90
+ dataset_name="mupe",
91
+ meta_file_train="metadata_coqui_brpe.csv",
92
+ path="/raid/datasets/MUPE/dataset/mupe/",
93
+ language="brpe"
94
+ )
95
+
96
+ brmg_train_config = BaseDatasetConfig(
97
+ formatter="coqui",
98
+ dataset_name="mupe",
99
+ meta_file_train="metadata_coqui_brmg.csv",
100
+ path="/raid/datasets/MUPE/dataset/mupe/",
101
+ language="brmg"
102
+ )
103
+
104
+ brrj_train_config = BaseDatasetConfig(
105
+ formatter="coqui",
106
+ dataset_name="mupe",
107
+ meta_file_train="metadata_coqui_brrj.csv",
108
+ path="/raid/datasets/MUPE/dataset/mupe/",
109
+ language="brrj"
110
+ )
111
+
112
+ brce_train_config = BaseDatasetConfig(
113
+ formatter="coqui",
114
+ dataset_name="mupe",
115
+ meta_file_train="metadata_coqui_brce.csv",
116
+ path="/raid/datasets/MUPE/dataset/mupe/",
117
+ language="brce"
118
+ )
119
+
120
+ brrs_train_config = BaseDatasetConfig(
121
+ formatter="coqui",
122
+ dataset_name="mupe",
123
+ meta_file_train="metadata_coqui_brrs.csv",
124
+ path="/raid/datasets/MUPE/dataset/mupe/",
125
+ language="brrs"
126
+ )
127
+
128
+ bralemanha_train_config = BaseDatasetConfig(
129
+ formatter="coqui",
130
+ dataset_name="mupe",
131
+ meta_file_train="metadata_coqui_bralemanha.csv",
132
+ path="/raid/datasets/MUPE/dataset/mupe/",
133
+ language="bralemanha"
134
+ )
135
+
136
+ brgo_train_config = BaseDatasetConfig(
137
+ formatter="coqui",
138
+ dataset_name="mupe",
139
+ meta_file_train="metadata_coqui_brgo.csv",
140
+ path="/raid/datasets/MUPE/dataset/mupe/",
141
+ language="brgo"
142
+ )
143
+
144
+ bral_train_config = BaseDatasetConfig(
145
+ formatter="coqui",
146
+ dataset_name="mupe",
147
+ meta_file_train="metadata_coqui_bral.csv",
148
+ path="/raid/datasets/MUPE/dataset/mupe/",
149
+ language="bral"
150
+ )
151
+
152
+ brpr_train_config = BaseDatasetConfig(
153
+ formatter="coqui",
154
+ dataset_name="mupe",
155
+ meta_file_train="metadata_coqui_brpr.csv",
156
+ path="/raid/datasets/MUPE/dataset/mupe/",
157
+ language="brpr"
158
+ )
159
+
160
+ bres_train_config = BaseDatasetConfig(
161
+ formatter="coqui",
162
+ dataset_name="mupe",
163
+ meta_file_train="metadata_coqui_bres.csv",
164
+ path="/raid/datasets/MUPE/dataset/mupe/",
165
+ language="bres"
166
+ )
167
+
168
+ brpi_train_config = BaseDatasetConfig(
169
+ formatter="coqui",
170
+ dataset_name="mupe",
171
+ meta_file_train="metadata_coqui_brpi.csv",
172
+ path="/raid/datasets/MUPE/dataset/mupe/",
173
+ language="brpi"
174
+ )
175
+
176
+ # bres_train_config and brpi_train_config are left out of the list below because no files were found for them
177
+ DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
178
+
179
+
180
+ ### Extract speaker embeddings
181
+ SPEAKER_ENCODER_CHECKPOINT_PATH = (
182
+ "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
183
+ )
184
+ SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
185
+
186
+ D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
187
+
188
+ # Iterate over all the dataset configs, checking whether the speaker embeddings have already been computed; if not, compute them
189
+ for dataset_conf in DATASETS_CONFIG_LIST:
190
+ # Check if the embeddings weren't already computed, if not compute it
191
+ embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
192
+ if not os.path.isfile(embeddings_file):
193
+ print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
194
+ compute_embeddings(
195
+ SPEAKER_ENCODER_CHECKPOINT_PATH,
196
+ SPEAKER_ENCODER_CONFIG_PATH,
197
+ embeddings_file,
198
+ old_speakers_file=None,
199
+ config_dataset_path=None,
200
+ formatter_name=dataset_conf.formatter,
201
+ dataset_name=dataset_conf.dataset_name,
202
+ dataset_path=dataset_conf.path,
203
+ meta_file_train=dataset_conf.meta_file_train,
204
+ meta_file_val=dataset_conf.meta_file_val,
205
+ disable_cuda=False,
206
+ no_eval=False,
207
+ )
208
+ D_VECTOR_FILES.append(embeddings_file)
209
+
210
+
211
+ # Audio config used in training.
212
+ audio_config = VitsAudioConfig(
213
+ sample_rate=SAMPLE_RATE,
214
+ hop_length=256,
215
+ win_length=1024,
216
+ fft_size=1024,
217
+ mel_fmin=0.0,
218
+ mel_fmax=None,
219
+ num_mels=80,
220
+ )
221
+
222
+ # Init VITSArgs setting the arguments that are needed for the YourTTS model
223
+ model_args = VitsArgs(
224
+ spec_segment_size=62,
225
+ hidden_channels=192,
226
+ hidden_channels_ffn_text_encoder=768,
227
+ num_heads_text_encoder=2,
228
+ num_layers_text_encoder=10,
229
+ kernel_size_text_encoder=3,
230
+ dropout_p_text_encoder=0.1,
231
+ d_vector_file=D_VECTOR_FILES,
232
+ use_d_vector_file=True,
233
+ d_vector_dim=512,
234
+ speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
235
+ speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
236
+ resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
237
+ # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
238
+ use_speaker_encoder_as_loss=False,
239
+ # Useful parameters to enable multilingual training
240
+ use_language_embedding=True,
241
+ embedded_language_dim=4,
242
+ use_adaptive_weight_text_encoder=False,
243
+ use_perfect_class_batch_sampler=True,
244
+ perfect_class_batch_sampler_key="language"
245
+ )
246
+
247
+ # General training config, here you can change the batch size and others useful parameters
248
+ config = VitsConfig(
249
+ output_path=OUT_PATH,
250
+ model_args=model_args,
251
+ run_name=RUN_NAME,
252
+ project_name="SYNTACC",
253
+ run_description="""
254
+ - YourTTS with SYNTACC text encoder
255
+ """,
256
+ dashboard_logger=DASHBOARD_LOGGER,
257
+ logger_uri=LOGGER_URI,
258
+ audio=audio_config,
259
+ batch_size=BATCH_SIZE,
260
+ batch_group_size=48,
261
+ eval_batch_size=BATCH_SIZE,
262
+ num_loader_workers=8,
263
+ eval_split_max_size=256,
264
+ print_step=50,
265
+ plot_step=100,
266
+ log_model_step=1000,
267
+ save_step=5000,
268
+ save_n_checkpoints=2,
269
+ save_checkpoints=True,
270
+ # target_loss="loss_1",
271
+ print_eval=False,
272
+ use_phonemes=False,
273
+ phonemizer="espeak",
274
+ phoneme_language="en",
275
+ compute_input_seq_cache=True,
276
+ add_blank=True,
277
+ text_cleaner="multilingual_cleaners",
278
+ characters=CharactersConfig(
279
+ characters_class="TTS.tts.models.vits.VitsCharacters",
280
+ pad="_",
281
+ eos="&",
282
+ bos="*",
283
+ blank=None,
284
+ characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
285
+ punctuations="\u2014!'(),-.:;?\u00bf ",
286
+ phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
287
+ is_unique=True,
288
+ is_sorted=True,
289
+ ),
290
+ phoneme_cache_path=None,
291
+ precompute_num_workers=12,
292
+ start_by_longest=True,
293
+ datasets=DATASETS_CONFIG_LIST,
294
+ cudnn_benchmark=False,
295
+ max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
296
+ mixed_precision=False,
297
+ test_sentences=[
298
+ # GUSTAVO: only speakers from the training set
299
+ ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
300
+ ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
301
+ ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
302
+ ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
303
+ ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
304
+ ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
305
+ ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
306
+ ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
307
+ ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
308
+ ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
309
+ ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
310
+ ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
311
+ ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
312
+ # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
313
+ # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
314
+ ],
315
+ # Enable the weighted sampler
316
+ use_weighted_sampler=True,
317
+ # Ensures that all languages are seen in the training batch equally no matter how many samples each language has (add "speaker_name" as in the commented-out line below to also balance speakers)
318
+ # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
319
+ weighted_sampler_attrs={"language": 1.0},
320
+ weighted_sampler_multipliers={
321
+ # "speaker_name": {
322
+ # you can force the batching scheme to give a higher weight to a certain speaker and then this speaker will appear more frequently in the batch.
323
+ # It will speed up the speaker adaptation process. Consider the CML train dataset and "new_speaker" as the name of the speaker that you want to adapt.
324
+ # The line below will make the balancer consider the "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers present in the CML dataset.
325
+ # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
326
+ # }
327
+ },
328
+ # It defines the Speaker Consistency Loss (SCL) α to 9 like the YourTTS paper
329
+ speaker_encoder_loss_alpha=9.0,
330
+ )
331
+
332
+ # Load all the dataset samples and split them into training and evaluation sets
333
+ train_samples, eval_samples = load_tts_samples(
334
+ config.datasets,
335
+ eval_split=True,
336
+ eval_split_max_size=config.eval_split_max_size,
337
+ eval_split_size=config.eval_split_size,
338
+ )
339
+
340
+ # Init the model
341
+ model = Vits.init_from_config(config)
342
+
343
+ # Init the trainer and 🚀
344
+ trainer = Trainer(
345
+ TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
346
+ config,
347
+ output_path=OUT_PATH,
348
+ model=model,
349
+ train_samples=train_samples,
350
+ eval_samples=eval_samples,
351
+ )
352
+ trainer.fit()
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/trainer_0_log.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94c095ee47fd6e763ee0e129a7728cf80e5e4f21301e767ab0141c478d369b89
3
+ size 128993
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a082ddde12d21020f66a70cf05a74826488d10008a8379b699458d92509e85d1
3
+ size 1043216142
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model_87192.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a082ddde12d21020f66a70cf05a74826488d10008a8379b699458d92509e85d1
3
+ size 1043216142
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_130000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a584eb832a857f9a11180b34a84b81117d8690ed1e5fa39e4ff711cf6ffd7f7
3
+ size 1043220766
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_135000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:367ac46477805942658a7a78e8cf473409537967f9382a46249a8d11521ed3f9
3
+ size 1043220766
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/raid/datasets/MUPE/Experiments/runs",
3
+ "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
4
+ "run_name": "YourTTS-Baseline-PT",
5
+ "project_name": "SYNTACC",
6
+ "run_description": "\n - YourTTS with SYNTACC text encoder\n ",
7
+ "print_step": 50,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "clearml",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": 1000,
14
+ "save_step": 5000,
15
+ "save_n_checkpoints": 2,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 10000,
19
+ "target_loss": null,
20
+ "print_eval": false,
21
+ "test_delay_epochs": 0,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 1000,
29
+ "batch_size": 26,
30
+ "eval_batch_size": 26,
31
+ "grad_clip": [
32
+ 1000,
33
+ 1000
34
+ ],
35
+ "scheduler_after_epoch": true,
36
+ "lr": 0.001,
37
+ "optimizer": "AdamW",
38
+ "optimizer_params": {
39
+ "betas": [
40
+ 0.8,
41
+ 0.99
42
+ ],
43
+ "eps": 1e-09,
44
+ "weight_decay": 0.01
45
+ },
46
+ "lr_scheduler": null,
47
+ "lr_scheduler_params": {},
48
+ "use_grad_scaler": false,
49
+ "allow_tf32": false,
50
+ "cudnn_enable": true,
51
+ "cudnn_deterministic": false,
52
+ "cudnn_benchmark": false,
53
+ "training_seed": 54321,
54
+ "model": "vits",
55
+ "num_loader_workers": 8,
56
+ "num_eval_loader_workers": 0,
57
+ "use_noise_augment": false,
58
+ "audio": {
59
+ "fft_size": 1024,
60
+ "sample_rate": 16000,
61
+ "win_length": 1024,
62
+ "hop_length": 256,
63
+ "num_mels": 80,
64
+ "mel_fmin": 0.0,
65
+ "mel_fmax": null
66
+ },
67
+ "use_phonemes": false,
68
+ "phonemizer": "espeak",
69
+ "phoneme_language": "en",
70
+ "compute_input_seq_cache": true,
71
+ "text_cleaner": "multilingual_cleaners",
72
+ "enable_eos_bos_chars": false,
73
+ "test_sentences_file": "",
74
+ "phoneme_cache_path": null,
75
+ "characters": {
76
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
77
+ "vocab_dict": null,
78
+ "pad": "_",
79
+ "eos": "&",
80
+ "bos": "*",
81
+ "blank": null,
82
+ "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
83
+ "punctuations": "\u2014!'(),-.:;?\u00bf ",
84
+ "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
85
+ "is_unique": true,
86
+ "is_sorted": true
87
+ },
88
+ "add_blank": true,
89
+ "batch_group_size": 48,
90
+ "loss_masking": null,
91
+ "min_audio_len": 1,
92
+ "max_audio_len": Infinity,
93
+ "min_text_len": 1,
94
+ "max_text_len": Infinity,
95
+ "compute_f0": false,
96
+ "compute_energy": false,
97
+ "compute_linear_spec": true,
98
+ "precompute_num_workers": 12,
99
+ "start_by_longest": true,
100
+ "shuffle": false,
101
+ "drop_last": false,
102
+ "datasets": [
103
+ {
104
+ "formatter": "coqui",
105
+ "dataset_name": "mupe",
106
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
107
+ "meta_file_train": "metadata_coqui_brpb.csv",
108
+ "ignored_speakers": null,
109
+ "language": "brpb",
110
+ "phonemizer": "",
111
+ "meta_file_val": "",
112
+ "meta_file_attn_mask": ""
113
+ },
114
+ {
115
+ "formatter": "coqui",
116
+ "dataset_name": "mupe",
117
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
118
+ "meta_file_train": "metadata_coqui_brba.csv",
119
+ "ignored_speakers": null,
120
+ "language": "brba",
121
+ "phonemizer": "",
122
+ "meta_file_val": "",
123
+ "meta_file_attn_mask": ""
124
+ },
125
+ {
126
+ "formatter": "coqui",
127
+ "dataset_name": "mupe",
128
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
129
+ "meta_file_train": "metadata_coqui_brportugal.csv",
130
+ "ignored_speakers": null,
131
+ "language": "brportugal",
132
+ "phonemizer": "",
133
+ "meta_file_val": "",
134
+ "meta_file_attn_mask": ""
135
+ },
136
+ {
137
+ "formatter": "coqui",
138
+ "dataset_name": "mupe",
139
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
140
+ "meta_file_train": "metadata_coqui_brsp.csv",
141
+ "ignored_speakers": null,
142
+ "language": "brsp",
143
+ "phonemizer": "",
144
+ "meta_file_val": "",
145
+ "meta_file_attn_mask": ""
146
+ },
147
+ {
148
+ "formatter": "coqui",
149
+ "dataset_name": "mupe",
150
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
151
+ "meta_file_train": "metadata_coqui_brpe.csv",
152
+ "ignored_speakers": null,
153
+ "language": "brpe",
154
+ "phonemizer": "",
155
+ "meta_file_val": "",
156
+ "meta_file_attn_mask": ""
157
+ },
158
+ {
159
+ "formatter": "coqui",
160
+ "dataset_name": "mupe",
161
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
162
+ "meta_file_train": "metadata_coqui_brmg.csv",
163
+ "ignored_speakers": null,
164
+ "language": "brmg",
165
+ "phonemizer": "",
166
+ "meta_file_val": "",
167
+ "meta_file_attn_mask": ""
168
+ },
169
+ {
170
+ "formatter": "coqui",
171
+ "dataset_name": "mupe",
172
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
173
+ "meta_file_train": "metadata_coqui_brrj.csv",
174
+ "ignored_speakers": null,
175
+ "language": "brrj",
176
+ "phonemizer": "",
177
+ "meta_file_val": "",
178
+ "meta_file_attn_mask": ""
179
+ },
180
+ {
181
+ "formatter": "coqui",
182
+ "dataset_name": "mupe",
183
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
184
+ "meta_file_train": "metadata_coqui_brce.csv",
185
+ "ignored_speakers": null,
186
+ "language": "brce",
187
+ "phonemizer": "",
188
+ "meta_file_val": "",
189
+ "meta_file_attn_mask": ""
190
+ },
191
+ {
192
+ "formatter": "coqui",
193
+ "dataset_name": "mupe",
194
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
195
+ "meta_file_train": "metadata_coqui_brrs.csv",
196
+ "ignored_speakers": null,
197
+ "language": "brrs",
198
+ "phonemizer": "",
199
+ "meta_file_val": "",
200
+ "meta_file_attn_mask": ""
201
+ },
202
+ {
203
+ "formatter": "coqui",
204
+ "dataset_name": "mupe",
205
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
206
+ "meta_file_train": "metadata_coqui_bralemanha.csv",
207
+ "ignored_speakers": null,
208
+ "language": "bralemanha",
209
+ "phonemizer": "",
210
+ "meta_file_val": "",
211
+ "meta_file_attn_mask": ""
212
+ },
213
+ {
214
+ "formatter": "coqui",
215
+ "dataset_name": "mupe",
216
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
217
+ "meta_file_train": "metadata_coqui_brgo.csv",
218
+ "ignored_speakers": null,
219
+ "language": "brgo",
220
+ "phonemizer": "",
221
+ "meta_file_val": "",
222
+ "meta_file_attn_mask": ""
223
+ },
224
+ {
225
+ "formatter": "coqui",
226
+ "dataset_name": "mupe",
227
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
228
+ "meta_file_train": "metadata_coqui_bral.csv",
229
+ "ignored_speakers": null,
230
+ "language": "bral",
231
+ "phonemizer": "",
232
+ "meta_file_val": "",
233
+ "meta_file_attn_mask": ""
234
+ },
235
+ {
236
+ "formatter": "coqui",
237
+ "dataset_name": "mupe",
238
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
239
+ "meta_file_train": "metadata_coqui_brpr.csv",
240
+ "ignored_speakers": null,
241
+ "language": "brpr",
242
+ "phonemizer": "",
243
+ "meta_file_val": "",
244
+ "meta_file_attn_mask": ""
245
+ }
246
+ ],
247
+ "test_sentences": [
248
+ [
249
+ "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
250
+ "EDILEINE_FONSECA",
251
+ null,
252
+ "brsp"
253
+ ],
254
+ [
255
+ "Quem semeia ventos, colhe tempestades.",
256
+ "JOSE_PAULO_DE_ARAUJO",
257
+ null,
258
+ "brpb"
259
+ ],
260
+ [
261
+ "O olho do dono \u00e9 que engorda o gado.",
262
+ "VITOR_RAFAEL_OLIVEIRA_ALVES",
263
+ null,
264
+ "brba"
265
+ ],
266
+ [
267
+ "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
268
+ "MARIA_AURORA_FELIX",
269
+ null,
270
+ "brportugal"
271
+ ],
272
+ [
273
+ "Quem espera sempre alcan\u00e7a.",
274
+ "ANTONIO_DE_AMORIM_COSTA",
275
+ null,
276
+ "brpe"
277
+ ],
278
+ [
279
+ "Cada macaco no seu galho.",
280
+ "ALCIDES_DE_LIMA",
281
+ null,
282
+ "brmg"
283
+ ],
284
+ [
285
+ "Em terra de cego, quem tem um olho \u00e9 rei.",
286
+ "ALUISIO_SOARES_DE_SOUSA",
287
+ null,
288
+ "brrj"
289
+ ],
290
+ [
291
+ "A ocasi\u00e3o faz o ladr\u00e3o.",
292
+ "FRANCISCO_JOSE_MOREIRA_MOTA",
293
+ null,
294
+ "brce"
295
+ ],
296
+ [
297
+ "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
298
+ "EVALDO_ANDRADA_CORREA",
299
+ null,
300
+ "brrs"
301
+ ],
302
+ [
303
+ "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
304
+ "DORIS_ALEXANDER",
305
+ null,
306
+ "bralemanha"
307
+ ],
308
+ [
309
+ "Quem n\u00e3o arrisca, n\u00e3o petisca.",
310
+ "DONALDO_LUIZ_DE_ALMEIDA",
311
+ null,
312
+ "brgo"
313
+ ],
314
+ [
315
+ "A uni\u00e3o faz a for\u00e7a.",
316
+ "GERONCIO_HENRIQUE_NETO",
317
+ null,
318
+ "bral"
319
+ ],
320
+ [
321
+ "Em boca fechada n\u00e3o entra mosquito.",
322
+ "MALU_NATEL_FREIRE_WEBER",
323
+ null,
324
+ "brpr"
325
+ ]
326
+ ],
327
+ "eval_split_max_size": 256,
328
+ "eval_split_size": 0.01,
329
+ "use_speaker_weighted_sampler": false,
330
+ "speaker_weighted_sampler_alpha": 1.0,
331
+ "use_language_weighted_sampler": false,
332
+ "language_weighted_sampler_alpha": 1.0,
333
+ "use_length_weighted_sampler": false,
334
+ "length_weighted_sampler_alpha": 1.0,
335
+ "model_args": {
336
+ "num_chars": 266,
337
+ "out_channels": 513,
338
+ "spec_segment_size": 62,
339
+ "hidden_channels": 192,
340
+ "use_adaptive_weight_text_encoder": false,
341
+ "use_perfect_class_batch_sampler": true,
342
+ "perfect_class_batch_sampler_key": "language",
343
+ "hidden_channels_ffn_text_encoder": 768,
344
+ "num_heads_text_encoder": 2,
345
+ "num_layers_text_encoder": 10,
346
+ "kernel_size_text_encoder": 3,
347
+ "dropout_p_text_encoder": 0.1,
348
+ "dropout_p_duration_predictor": 0.5,
349
+ "kernel_size_posterior_encoder": 5,
350
+ "dilation_rate_posterior_encoder": 1,
351
+ "num_layers_posterior_encoder": 16,
352
+ "kernel_size_flow": 5,
353
+ "dilation_rate_flow": 1,
354
+ "num_layers_flow": 4,
355
+ "resblock_type_decoder": "2",
356
+ "resblock_kernel_sizes_decoder": [
357
+ 3,
358
+ 7,
359
+ 11
360
+ ],
361
+ "resblock_dilation_sizes_decoder": [
362
+ [
363
+ 1,
364
+ 3,
365
+ 5
366
+ ],
367
+ [
368
+ 1,
369
+ 3,
370
+ 5
371
+ ],
372
+ [
373
+ 1,
374
+ 3,
375
+ 5
376
+ ]
377
+ ],
378
+ "upsample_rates_decoder": [
379
+ 8,
380
+ 8,
381
+ 2,
382
+ 2
383
+ ],
384
+ "upsample_initial_channel_decoder": 512,
385
+ "upsample_kernel_sizes_decoder": [
386
+ 16,
387
+ 16,
388
+ 4,
389
+ 4
390
+ ],
391
+ "periods_multi_period_discriminator": [
392
+ 2,
393
+ 3,
394
+ 5,
395
+ 7,
396
+ 11
397
+ ],
398
+ "use_sdp": true,
399
+ "noise_scale": 1.0,
400
+ "inference_noise_scale": 0.667,
401
+ "length_scale": 1,
402
+ "noise_scale_dp": 1.0,
403
+ "inference_noise_scale_dp": 1.0,
404
+ "max_inference_len": null,
405
+ "init_discriminator": true,
406
+ "use_spectral_norm_disriminator": false,
407
+ "use_speaker_embedding": false,
408
+ "num_speakers": 0,
409
+ "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth",
410
+ "d_vector_file": [
411
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
412
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
413
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
414
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
415
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
416
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
417
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
418
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
419
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
420
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
421
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
422
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
423
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
424
+ ],
425
+ "speaker_embedding_channels": 256,
426
+ "use_d_vector_file": true,
427
+ "d_vector_dim": 512,
428
+ "detach_dp_input": true,
429
+ "use_language_embedding": true,
430
+ "embedded_language_dim": 4,
431
+ "num_languages": 0,
432
+ "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json",
433
+ "use_speaker_encoder_as_loss": false,
434
+ "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
435
+ "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
436
+ "condition_dp_on_speaker": true,
437
+ "freeze_encoder": false,
438
+ "freeze_DP": false,
439
+ "freeze_PE": false,
440
+ "freeze_flow_decoder": false,
441
+ "freeze_waveform_decoder": false,
442
+ "encoder_sample_rate": null,
443
+ "interpolate_z": true,
444
+ "reinit_DP": false,
445
+ "reinit_text_encoder": false
446
+ },
447
+ "lr_gen": 0.0002,
448
+ "lr_disc": 0.0002,
449
+ "lr_scheduler_gen": "ExponentialLR",
450
+ "lr_scheduler_gen_params": {
451
+ "gamma": 0.999875,
452
+ "last_epoch": -1
453
+ },
454
+ "lr_scheduler_disc": "ExponentialLR",
455
+ "lr_scheduler_disc_params": {
456
+ "gamma": 0.999875,
457
+ "last_epoch": -1
458
+ },
459
+ "kl_loss_alpha": 1.0,
460
+ "disc_loss_alpha": 1.0,
461
+ "gen_loss_alpha": 1.0,
462
+ "feat_loss_alpha": 1.0,
463
+ "mel_loss_alpha": 45.0,
464
+ "dur_loss_alpha": 1.0,
465
+ "speaker_encoder_loss_alpha": 9.0,
466
+ "return_wav": true,
467
+ "use_weighted_sampler": true,
468
+ "weighted_sampler_attrs": {
469
+ "language": 1.0
470
+ },
471
+ "weighted_sampler_multipliers": {},
472
+ "r": 1,
473
+ "num_speakers": 0,
474
+ "use_speaker_embedding": false,
475
+ "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth",
476
+ "speaker_embedding_channels": 256,
477
+ "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json",
478
+ "use_language_embedding": true,
479
+ "use_d_vector_file": true,
480
+ "d_vector_file": [
481
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
482
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
483
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
484
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
485
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
486
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
487
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
488
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
489
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
490
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
491
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
492
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
493
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
494
+ ],
495
+ "d_vector_dim": 512
496
+ }
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bral": 0,
3
+ "bralemanha": 1,
4
+ "brba": 2,
5
+ "brce": 3,
6
+ "brgo": 4,
7
+ "brmg": 5,
8
+ "brpb": 6,
9
+ "brpe": 7,
10
+ "brportugal": 8,
11
+ "brpr": 9,
12
+ "brrj": 10,
13
+ "brrs": 11,
14
+ "brsp": 12
15
+ }
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
3
+ size 3296
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/train_syntacc_baseline.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.bin.compute_embeddings import compute_embeddings
7
+ from TTS.bin.resample import resample_files
8
+ from TTS.config.shared_configs import BaseDatasetConfig
9
+ from TTS.tts.configs.vits_config import VitsConfig
10
+ from TTS.tts.datasets import load_tts_samples
11
+ from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
12
+ from TTS.utils.downloaders import download_libri_tts
13
+ from torch.utils.data import DataLoader
14
+ from TTS.utils.samplers import PerfectBatchSampler
15
+ torch.set_num_threads(24)
16
+
17
+ # pylint: disable=W0105
18
+ """
19
+ This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
20
+ YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
21
+ """
22
+ CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
23
+
24
+ # Name of the run for the Trainer
25
+ RUN_NAME = "YourTTS-Baseline-PT"
26
+
27
+ # Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
28
+ OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
29
+
30
+ # If you want to do transfer learning and speed up your training, set here the path to the available CML-TTS checkpoint, which can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
31
+ RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
32
+
33
+ # This parameter is useful for debugging: it skips the training epochs and only runs the evaluation and produces the test sentences
34
+ SKIP_TRAIN_EPOCH = False
35
+
36
+ # Set here the batch size to be used in training and evaluation
37
+ BATCH_SIZE = 26
38
+
39
+ # Training sampling rate and the target sampling rate for resampling the downloaded dataset (Note: if you change this, you might need to re-download the dataset!)
40
+ # Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
41
+ SAMPLE_RATE = 16000
42
+
43
+
44
+ DASHBOARD_LOGGER="tensorboard"
45
+ LOGGER_URI = None
46
+
47
+ DASHBOARD_LOGGER = "clearml"
48
+ LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
49
+
50
+
51
+
52
+ # Max audio length in seconds to be used in training (any audio longer than this is ignored)
53
+ MAX_AUDIO_LEN_IN_SECONDS = float("inf")
54
+
55
+ # Define here the datasets config
56
+ brpb_train_config = BaseDatasetConfig(
57
+ formatter="coqui",
58
+ dataset_name="mupe",
59
+ meta_file_train="metadata_coqui_brpb.csv",
60
+ path="/raid/datasets/MUPE/dataset/mupe/",
61
+ language="brpb"
62
+ )
63
+
64
+ brba_train_config = BaseDatasetConfig(
65
+ formatter="coqui",
66
+ dataset_name="mupe",
67
+ meta_file_train="metadata_coqui_brba.csv",
68
+ path="/raid/datasets/MUPE/dataset/mupe/",
69
+ language="brba"
70
+ )
71
+
72
+ brportugal_train_config = BaseDatasetConfig(
73
+ formatter="coqui",
74
+ dataset_name="mupe",
75
+ meta_file_train="metadata_coqui_brportugal.csv",
76
+ path="/raid/datasets/MUPE/dataset/mupe/",
77
+ language="brportugal"
78
+ )
79
+
80
+ brsp_train_config = BaseDatasetConfig(
81
+ formatter="coqui",
82
+ dataset_name="mupe",
83
+ meta_file_train="metadata_coqui_brsp.csv",
84
+ path="/raid/datasets/MUPE/dataset/mupe/",
85
+ language="brsp"
86
+ )
87
+
88
+ brpe_train_config = BaseDatasetConfig(
89
+ formatter="coqui",
90
+ dataset_name="mupe",
91
+ meta_file_train="metadata_coqui_brpe.csv",
92
+ path="/raid/datasets/MUPE/dataset/mupe/",
93
+ language="brpe"
94
+ )
95
+
96
+ brmg_train_config = BaseDatasetConfig(
97
+ formatter="coqui",
98
+ dataset_name="mupe",
99
+ meta_file_train="metadata_coqui_brmg.csv",
100
+ path="/raid/datasets/MUPE/dataset/mupe/",
101
+ language="brmg"
102
+ )
103
+
104
+ brrj_train_config = BaseDatasetConfig(
105
+ formatter="coqui",
106
+ dataset_name="mupe",
107
+ meta_file_train="metadata_coqui_brrj.csv",
108
+ path="/raid/datasets/MUPE/dataset/mupe/",
109
+ language="brrj"
110
+ )
111
+
112
+ brce_train_config = BaseDatasetConfig(
113
+ formatter="coqui",
114
+ dataset_name="mupe",
115
+ meta_file_train="metadata_coqui_brce.csv",
116
+ path="/raid/datasets/MUPE/dataset/mupe/",
117
+ language="brce"
118
+ )
119
+
120
+ brrs_train_config = BaseDatasetConfig(
121
+ formatter="coqui",
122
+ dataset_name="mupe",
123
+ meta_file_train="metadata_coqui_brrs.csv",
124
+ path="/raid/datasets/MUPE/dataset/mupe/",
125
+ language="brrs"
126
+ )
127
+
128
+ bralemanha_train_config = BaseDatasetConfig(
129
+ formatter="coqui",
130
+ dataset_name="mupe",
131
+ meta_file_train="metadata_coqui_bralemanha.csv",
132
+ path="/raid/datasets/MUPE/dataset/mupe/",
133
+ language="bralemanha"
134
+ )
135
+
136
+ brgo_train_config = BaseDatasetConfig(
137
+ formatter="coqui",
138
+ dataset_name="mupe",
139
+ meta_file_train="metadata_coqui_brgo.csv",
140
+ path="/raid/datasets/MUPE/dataset/mupe/",
141
+ language="brgo"
142
+ )
143
+
144
+ bral_train_config = BaseDatasetConfig(
145
+ formatter="coqui",
146
+ dataset_name="mupe",
147
+ meta_file_train="metadata_coqui_bral.csv",
148
+ path="/raid/datasets/MUPE/dataset/mupe/",
149
+ language="bral"
150
+ )
151
+
152
+ brpr_train_config = BaseDatasetConfig(
153
+ formatter="coqui",
154
+ dataset_name="mupe",
155
+ meta_file_train="metadata_coqui_brpr.csv",
156
+ path="/raid/datasets/MUPE/dataset/mupe/",
157
+ language="brpr"
158
+ )
159
+
160
+ bres_train_config = BaseDatasetConfig(
161
+ formatter="coqui",
162
+ dataset_name="mupe",
163
+ meta_file_train="metadata_coqui_bres.csv",
164
+ path="/raid/datasets/MUPE/dataset/mupe/",
165
+ language="bres"
166
+ )
167
+
168
+ brpi_train_config = BaseDatasetConfig(
169
+ formatter="coqui",
170
+ dataset_name="mupe",
171
+ meta_file_train="metadata_coqui_brpi.csv",
172
+ path="/raid/datasets/MUPE/dataset/mupe/",
173
+ language="brpi"
174
+ )
175
+
176
+ # bres_train_config and brpi_train_config are left out of the list below: no files were found for them
177
+ DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
178
+
179
+
180
+ ### Extract speaker embeddings
181
+ SPEAKER_ENCODER_CHECKPOINT_PATH = (
182
+ "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
183
+ )
184
+ SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
185
+
186
+ D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
187
+
188
+ # Iterate over all the dataset configs, checking whether the speaker embeddings are already computed; if not, compute them
189
+ for dataset_conf in DATASETS_CONFIG_LIST:
190
+ # Check if the embeddings weren't already computed, if not compute it
191
+ embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
192
+ if not os.path.isfile(embeddings_file):
193
+ print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
194
+ compute_embeddings(
195
+ SPEAKER_ENCODER_CHECKPOINT_PATH,
196
+ SPEAKER_ENCODER_CONFIG_PATH,
197
+ embeddings_file,
198
+ old_speakers_file=None,
199
+ config_dataset_path=None,
200
+ formatter_name=dataset_conf.formatter,
201
+ dataset_name=dataset_conf.dataset_name,
202
+ dataset_path=dataset_conf.path,
203
+ meta_file_train=dataset_conf.meta_file_train,
204
+ meta_file_val=dataset_conf.meta_file_val,
205
+ disable_cuda=False,
206
+ no_eval=False,
207
+ )
208
+ D_VECTOR_FILES.append(embeddings_file)
209
+
210
+
211
+ # Audio config used in training.
212
+ audio_config = VitsAudioConfig(
213
+ sample_rate=SAMPLE_RATE,
214
+ hop_length=256,
215
+ win_length=1024,
216
+ fft_size=1024,
217
+ mel_fmin=0.0,
218
+ mel_fmax=None,
219
+ num_mels=80,
220
+ )
221
+
222
+ # Init VITSArgs setting the arguments that are needed for the YourTTS model
223
+ model_args = VitsArgs(
224
+ spec_segment_size=62,
225
+ hidden_channels=192,
226
+ hidden_channels_ffn_text_encoder=768,
227
+ num_heads_text_encoder=2,
228
+ num_layers_text_encoder=10,
229
+ kernel_size_text_encoder=3,
230
+ dropout_p_text_encoder=0.1,
231
+ d_vector_file=D_VECTOR_FILES,
232
+ use_d_vector_file=True,
233
+ d_vector_dim=512,
234
+ speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
235
+ speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
236
+ resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
237
+ # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
238
+ use_speaker_encoder_as_loss=False,
239
+ # Useful parameters to enable multilingual training
240
+ use_language_embedding=True,
241
+ embedded_language_dim=4,
242
+ use_adaptive_weight_text_encoder=False,
243
+ use_perfect_class_batch_sampler=True,
244
+ perfect_class_batch_sampler_key="language"
245
+ )
246
+
247
+ # General training config, here you can change the batch size and others useful parameters
248
+ config = VitsConfig(
249
+ output_path=OUT_PATH,
250
+ model_args=model_args,
251
+ run_name=RUN_NAME,
252
+ project_name="SYNTACC",
253
+ run_description="""
254
+ - YourTTS with SYNTACC text encoder
255
+ """,
256
+ dashboard_logger=DASHBOARD_LOGGER,
257
+ logger_uri=LOGGER_URI,
258
+ audio=audio_config,
259
+ batch_size=BATCH_SIZE,
260
+ batch_group_size=48,
261
+ eval_batch_size=BATCH_SIZE,
262
+ num_loader_workers=8,
263
+ eval_split_max_size=256,
264
+ print_step=50,
265
+ plot_step=100,
266
+ log_model_step=1000,
267
+ save_step=5000,
268
+ save_n_checkpoints=2,
269
+ save_checkpoints=True,
270
+ # target_loss="loss_1",
271
+ print_eval=False,
272
+ use_phonemes=False,
273
+ phonemizer="espeak",
274
+ phoneme_language="en",
275
+ compute_input_seq_cache=True,
276
+ add_blank=True,
277
+ text_cleaner="multilingual_cleaners",
278
+ characters=CharactersConfig(
279
+ characters_class="TTS.tts.models.vits.VitsCharacters",
280
+ pad="_",
281
+ eos="&",
282
+ bos="*",
283
+ blank=None,
284
+ characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
285
+ punctuations="\u2014!'(),-.:;?\u00bf ",
286
+ phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
287
+ is_unique=True,
288
+ is_sorted=True,
289
+ ),
290
+ phoneme_cache_path=None,
291
+ precompute_num_workers=12,
292
+ start_by_longest=True,
293
+ datasets=DATASETS_CONFIG_LIST,
294
+ cudnn_benchmark=False,
295
+ max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
296
+ mixed_precision=False,
297
+ test_sentences=[
298
+ # GUSTAVO: only speakers from the training set
299
+ ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
300
+ ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
301
+ ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
302
+ ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
303
+ ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
304
+ ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
305
+ ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
306
+ ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
307
+ ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
308
+ ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
309
+ ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
310
+ ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
311
+ ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
312
+ # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
313
+ # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
314
+ ],
315
+ # Enable the weighted sampler
316
+ use_weighted_sampler=True,
317
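+ # NOTE(review): the original comment said this ensures all speakers are seen equally in the training batch, but the active weighted_sampler_attrs below only lists "language"; the speaker-balanced variant is the commented-out line. Confirm which is intended.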
318
+ # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
319
+ weighted_sampler_attrs={"language": 1.0},
320
+ weighted_sampler_multipliers={
321
+ # "speaker_name": {
322
+ # You can force the batching scheme to give a higher weight to a certain speaker so that this speaker appears more frequently in the batch.
323
+ # This speeds up the speaker adaptation process. Consider the CML train dataset, with "new_speaker" as the name of the speaker that you want to adapt.
324
+ # The line below makes the balancer count "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers present in the CML dataset.
325
+ # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
326
+ # }
327
+ },
328
+ # Sets the Speaker Consistency Loss (SCL) weight α to 9, as in the YourTTS paper
329
+ speaker_encoder_loss_alpha=9.0,
330
+ )
331
+
332
+ # Load all the dataset samples and split them into training and evaluation sets
333
+ train_samples, eval_samples = load_tts_samples(
334
+ config.datasets,
335
+ eval_split=True,
336
+ eval_split_max_size=config.eval_split_max_size,
337
+ eval_split_size=config.eval_split_size,
338
+ )
339
+
340
+ # Init the model
341
+ model = Vits.init_from_config(config)
342
+
343
+ # Init the trainer and 🚀
344
+ trainer = Trainer(
345
+ TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
346
+ config,
347
+ output_path=OUT_PATH,
348
+ model=model,
349
+ train_samples=train_samples,
350
+ eval_samples=eval_samples,
351
+ )
352
+ trainer.fit()
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/trainer_0_log.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ddf81cb4061c7e47bd824c3ebb109cc02bc31ab79ee21e4e69d60d32aca454b
3
+ size 1794644
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cb1d72efa1724f811028b33a003492d486385a35846b2a09aae34ece757cbab
3
+ size 1044057134
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/best_model_78415.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cb1d72efa1724f811028b33a003492d486385a35846b2a09aae34ece757cbab
3
+ size 1044057134
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_80000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5208e907be9e2db12b928d9c2b1abd4df0b757f34703f124db1a326449a882f2
3
+ size 1044057198
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f77b5c81d12b629d83cce93a9b0318eb1d41888e6e985706fa275841c92444d3
3
+ size 1044057198
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/config.json ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/raid/datasets/MUPE/Experiments/runs",
3
+ "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
4
+ "run_name": "YourTTS-Syntacc-PT",
5
+ "project_name": "SYNTACC",
6
+ "run_description": "\n - YourTTS with SYNTACC text encoder\n ",
7
+ "print_step": 50,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "clearml",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": 1000,
14
+ "save_step": 5000,
15
+ "save_n_checkpoints": 2,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 10000,
19
+ "target_loss": null,
20
+ "print_eval": false,
21
+ "test_delay_epochs": 0,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 1000,
29
+ "batch_size": 26,
30
+ "eval_batch_size": 26,
31
+ "grad_clip": [
32
+ 1000,
33
+ 1000
34
+ ],
35
+ "scheduler_after_epoch": true,
36
+ "lr": 0.001,
37
+ "optimizer": "AdamW",
38
+ "optimizer_params": {
39
+ "betas": [
40
+ 0.8,
41
+ 0.99
42
+ ],
43
+ "eps": 1e-09,
44
+ "weight_decay": 0.01
45
+ },
46
+ "lr_scheduler": null,
47
+ "lr_scheduler_params": {},
48
+ "use_grad_scaler": false,
49
+ "allow_tf32": false,
50
+ "cudnn_enable": true,
51
+ "cudnn_deterministic": false,
52
+ "cudnn_benchmark": false,
53
+ "training_seed": 54321,
54
+ "model": "vits",
55
+ "num_loader_workers": 8,
56
+ "num_eval_loader_workers": 0,
57
+ "use_noise_augment": false,
58
+ "audio": {
59
+ "fft_size": 1024,
60
+ "sample_rate": 16000,
61
+ "win_length": 1024,
62
+ "hop_length": 256,
63
+ "num_mels": 80,
64
+ "mel_fmin": 0.0,
65
+ "mel_fmax": null
66
+ },
67
+ "use_phonemes": false,
68
+ "phonemizer": "espeak",
69
+ "phoneme_language": "en",
70
+ "compute_input_seq_cache": true,
71
+ "text_cleaner": "multilingual_cleaners",
72
+ "enable_eos_bos_chars": false,
73
+ "test_sentences_file": "",
74
+ "phoneme_cache_path": null,
75
+ "characters": {
76
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
77
+ "vocab_dict": null,
78
+ "pad": "_",
79
+ "eos": "&",
80
+ "bos": "*",
81
+ "blank": null,
82
+ "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
83
+ "punctuations": "\u2014!'(),-.:;?\u00bf ",
84
+ "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
85
+ "is_unique": true,
86
+ "is_sorted": true
87
+ },
88
+ "add_blank": true,
89
+ "batch_group_size": 48,
90
+ "loss_masking": null,
91
+ "min_audio_len": 1,
92
+ "max_audio_len": Infinity,
93
+ "min_text_len": 1,
94
+ "max_text_len": Infinity,
95
+ "compute_f0": false,
96
+ "compute_energy": false,
97
+ "compute_linear_spec": true,
98
+ "precompute_num_workers": 12,
99
+ "start_by_longest": true,
100
+ "shuffle": false,
101
+ "drop_last": false,
102
+ "datasets": [
103
+ {
104
+ "formatter": "coqui",
105
+ "dataset_name": "mupe",
106
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
107
+ "meta_file_train": "metadata_coqui_brpb.csv",
108
+ "ignored_speakers": null,
109
+ "language": "brpb",
110
+ "phonemizer": "",
111
+ "meta_file_val": "",
112
+ "meta_file_attn_mask": ""
113
+ },
114
+ {
115
+ "formatter": "coqui",
116
+ "dataset_name": "mupe",
117
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
118
+ "meta_file_train": "metadata_coqui_brba.csv",
119
+ "ignored_speakers": null,
120
+ "language": "brba",
121
+ "phonemizer": "",
122
+ "meta_file_val": "",
123
+ "meta_file_attn_mask": ""
124
+ },
125
+ {
126
+ "formatter": "coqui",
127
+ "dataset_name": "mupe",
128
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
129
+ "meta_file_train": "metadata_coqui_brportugal.csv",
130
+ "ignored_speakers": null,
131
+ "language": "brportugal",
132
+ "phonemizer": "",
133
+ "meta_file_val": "",
134
+ "meta_file_attn_mask": ""
135
+ },
136
+ {
137
+ "formatter": "coqui",
138
+ "dataset_name": "mupe",
139
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
140
+ "meta_file_train": "metadata_coqui_brsp.csv",
141
+ "ignored_speakers": null,
142
+ "language": "brsp",
143
+ "phonemizer": "",
144
+ "meta_file_val": "",
145
+ "meta_file_attn_mask": ""
146
+ },
147
+ {
148
+ "formatter": "coqui",
149
+ "dataset_name": "mupe",
150
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
151
+ "meta_file_train": "metadata_coqui_brpe.csv",
152
+ "ignored_speakers": null,
153
+ "language": "brpe",
154
+ "phonemizer": "",
155
+ "meta_file_val": "",
156
+ "meta_file_attn_mask": ""
157
+ },
158
+ {
159
+ "formatter": "coqui",
160
+ "dataset_name": "mupe",
161
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
162
+ "meta_file_train": "metadata_coqui_brmg.csv",
163
+ "ignored_speakers": null,
164
+ "language": "brmg",
165
+ "phonemizer": "",
166
+ "meta_file_val": "",
167
+ "meta_file_attn_mask": ""
168
+ },
169
+ {
170
+ "formatter": "coqui",
171
+ "dataset_name": "mupe",
172
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
173
+ "meta_file_train": "metadata_coqui_brrj.csv",
174
+ "ignored_speakers": null,
175
+ "language": "brrj",
176
+ "phonemizer": "",
177
+ "meta_file_val": "",
178
+ "meta_file_attn_mask": ""
179
+ },
180
+ {
181
+ "formatter": "coqui",
182
+ "dataset_name": "mupe",
183
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
184
+ "meta_file_train": "metadata_coqui_brce.csv",
185
+ "ignored_speakers": null,
186
+ "language": "brce",
187
+ "phonemizer": "",
188
+ "meta_file_val": "",
189
+ "meta_file_attn_mask": ""
190
+ },
191
+ {
192
+ "formatter": "coqui",
193
+ "dataset_name": "mupe",
194
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
195
+ "meta_file_train": "metadata_coqui_brrs.csv",
196
+ "ignored_speakers": null,
197
+ "language": "brrs",
198
+ "phonemizer": "",
199
+ "meta_file_val": "",
200
+ "meta_file_attn_mask": ""
201
+ },
202
+ {
203
+ "formatter": "coqui",
204
+ "dataset_name": "mupe",
205
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
206
+ "meta_file_train": "metadata_coqui_bralemanha.csv",
207
+ "ignored_speakers": null,
208
+ "language": "bralemanha",
209
+ "phonemizer": "",
210
+ "meta_file_val": "",
211
+ "meta_file_attn_mask": ""
212
+ },
213
+ {
214
+ "formatter": "coqui",
215
+ "dataset_name": "mupe",
216
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
217
+ "meta_file_train": "metadata_coqui_brgo.csv",
218
+ "ignored_speakers": null,
219
+ "language": "brgo",
220
+ "phonemizer": "",
221
+ "meta_file_val": "",
222
+ "meta_file_attn_mask": ""
223
+ },
224
+ {
225
+ "formatter": "coqui",
226
+ "dataset_name": "mupe",
227
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
228
+ "meta_file_train": "metadata_coqui_bral.csv",
229
+ "ignored_speakers": null,
230
+ "language": "bral",
231
+ "phonemizer": "",
232
+ "meta_file_val": "",
233
+ "meta_file_attn_mask": ""
234
+ },
235
+ {
236
+ "formatter": "coqui",
237
+ "dataset_name": "mupe",
238
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
239
+ "meta_file_train": "metadata_coqui_brpr.csv",
240
+ "ignored_speakers": null,
241
+ "language": "brpr",
242
+ "phonemizer": "",
243
+ "meta_file_val": "",
244
+ "meta_file_attn_mask": ""
245
+ }
246
+ ],
247
+ "test_sentences": [
248
+ [
249
+ "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
250
+ "EDILEINE_FONSECA",
251
+ null,
252
+ "brsp"
253
+ ],
254
+ [
255
+ "Quem semeia ventos, colhe tempestades.",
256
+ "JOSE_PAULO_DE_ARAUJO",
257
+ null,
258
+ "brpb"
259
+ ],
260
+ [
261
+ "O olho do dono \u00e9 que engorda o gado.",
262
+ "VITOR_RAFAEL_OLIVEIRA_ALVES",
263
+ null,
264
+ "brba"
265
+ ],
266
+ [
267
+ "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
268
+ "MARIA_AURORA_FELIX",
269
+ null,
270
+ "brportugal"
271
+ ],
272
+ [
273
+ "Quem espera sempre alcan\u00e7a.",
274
+ "ANTONIO_DE_AMORIM_COSTA",
275
+ null,
276
+ "brpe"
277
+ ],
278
+ [
279
+ "Cada macaco no seu galho.",
280
+ "ALCIDES_DE_LIMA",
281
+ null,
282
+ "brmg"
283
+ ],
284
+ [
285
+ "Em terra de cego, quem tem um olho \u00e9 rei.",
286
+ "ALUISIO_SOARES_DE_SOUSA",
287
+ null,
288
+ "brrj"
289
+ ],
290
+ [
291
+ "A ocasi\u00e3o faz o ladr\u00e3o.",
292
+ "FRANCISCO_JOSE_MOREIRA_MOTA",
293
+ null,
294
+ "brce"
295
+ ],
296
+ [
297
+ "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
298
+ "EVALDO_ANDRADA_CORREA",
299
+ null,
300
+ "brrs"
301
+ ],
302
+ [
303
+ "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
304
+ "DORIS_ALEXANDER",
305
+ null,
306
+ "bralemanha"
307
+ ],
308
+ [
309
+ "Quem n\u00e3o arrisca, n\u00e3o petisca.",
310
+ "DONALDO_LUIZ_DE_ALMEIDA",
311
+ null,
312
+ "brgo"
313
+ ],
314
+ [
315
+ "A uni\u00e3o faz a for\u00e7a.",
316
+ "GERONCIO_HENRIQUE_NETO",
317
+ null,
318
+ "bral"
319
+ ],
320
+ [
321
+ "Em boca fechada n\u00e3o entra mosquito.",
322
+ "MALU_NATEL_FREIRE_WEBER",
323
+ null,
324
+ "brpr"
325
+ ]
326
+ ],
327
+ "eval_split_max_size": 256,
328
+ "eval_split_size": 0.01,
329
+ "use_speaker_weighted_sampler": false,
330
+ "speaker_weighted_sampler_alpha": 1.0,
331
+ "use_language_weighted_sampler": false,
332
+ "language_weighted_sampler_alpha": 1.0,
333
+ "use_length_weighted_sampler": false,
334
+ "length_weighted_sampler_alpha": 1.0,
335
+ "model_args": {
336
+ "num_chars": 266,
337
+ "out_channels": 513,
338
+ "spec_segment_size": 62,
339
+ "hidden_channels": 192,
340
+ "use_adaptive_weight_text_encoder": true,
341
+ "use_perfect_class_batch_sampler": true,
342
+ "perfect_class_batch_sampler_key": "language",
343
+ "hidden_channels_ffn_text_encoder": 768,
344
+ "num_heads_text_encoder": 2,
345
+ "num_layers_text_encoder": 10,
346
+ "kernel_size_text_encoder": 3,
347
+ "dropout_p_text_encoder": 0.1,
348
+ "dropout_p_duration_predictor": 0.5,
349
+ "kernel_size_posterior_encoder": 5,
350
+ "dilation_rate_posterior_encoder": 1,
351
+ "num_layers_posterior_encoder": 16,
352
+ "kernel_size_flow": 5,
353
+ "dilation_rate_flow": 1,
354
+ "num_layers_flow": 4,
355
+ "resblock_type_decoder": "2",
356
+ "resblock_kernel_sizes_decoder": [
357
+ 3,
358
+ 7,
359
+ 11
360
+ ],
361
+ "resblock_dilation_sizes_decoder": [
362
+ [
363
+ 1,
364
+ 3,
365
+ 5
366
+ ],
367
+ [
368
+ 1,
369
+ 3,
370
+ 5
371
+ ],
372
+ [
373
+ 1,
374
+ 3,
375
+ 5
376
+ ]
377
+ ],
378
+ "upsample_rates_decoder": [
379
+ 8,
380
+ 8,
381
+ 2,
382
+ 2
383
+ ],
384
+ "upsample_initial_channel_decoder": 512,
385
+ "upsample_kernel_sizes_decoder": [
386
+ 16,
387
+ 16,
388
+ 4,
389
+ 4
390
+ ],
391
+ "periods_multi_period_discriminator": [
392
+ 2,
393
+ 3,
394
+ 5,
395
+ 7,
396
+ 11
397
+ ],
398
+ "use_sdp": true,
399
+ "noise_scale": 1.0,
400
+ "inference_noise_scale": 0.667,
401
+ "length_scale": 1,
402
+ "noise_scale_dp": 1.0,
403
+ "inference_noise_scale_dp": 1.0,
404
+ "max_inference_len": null,
405
+ "init_discriminator": true,
406
+ "use_spectral_norm_disriminator": false,
407
+ "use_speaker_embedding": false,
408
+ "num_speakers": 0,
409
+ "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/speakers.pth",
410
+ "d_vector_file": [
411
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
412
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
413
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
414
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
415
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
416
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
417
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
418
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
419
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
420
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
421
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
422
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
423
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
424
+ ],
425
+ "speaker_embedding_channels": 256,
426
+ "use_d_vector_file": true,
427
+ "d_vector_dim": 512,
428
+ "detach_dp_input": true,
429
+ "use_language_embedding": false,
430
+ "embedded_language_dim": 4,
431
+ "num_languages": 0,
432
+ "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/language_ids.json",
433
+ "use_speaker_encoder_as_loss": false,
434
+ "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
435
+ "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
436
+ "condition_dp_on_speaker": true,
437
+ "freeze_encoder": false,
438
+ "freeze_DP": false,
439
+ "freeze_PE": false,
440
+ "freeze_flow_decoder": false,
441
+ "freeze_waveform_decoder": false,
442
+ "encoder_sample_rate": null,
443
+ "interpolate_z": true,
444
+ "reinit_DP": false,
445
+ "reinit_text_encoder": false
446
+ },
447
+ "lr_gen": 0.0002,
448
+ "lr_disc": 0.0002,
449
+ "lr_scheduler_gen": "ExponentialLR",
450
+ "lr_scheduler_gen_params": {
451
+ "gamma": 0.999875,
452
+ "last_epoch": -1
453
+ },
454
+ "lr_scheduler_disc": "ExponentialLR",
455
+ "lr_scheduler_disc_params": {
456
+ "gamma": 0.999875,
457
+ "last_epoch": -1
458
+ },
459
+ "kl_loss_alpha": 1.0,
460
+ "disc_loss_alpha": 1.0,
461
+ "gen_loss_alpha": 1.0,
462
+ "feat_loss_alpha": 1.0,
463
+ "mel_loss_alpha": 45.0,
464
+ "dur_loss_alpha": 1.0,
465
+ "speaker_encoder_loss_alpha": 9.0,
466
+ "return_wav": true,
467
+ "use_weighted_sampler": true,
468
+ "weighted_sampler_attrs": {
469
+ "language": 1.0
470
+ },
471
+ "weighted_sampler_multipliers": {},
472
+ "r": 1,
473
+ "num_speakers": 0,
474
+ "use_speaker_embedding": false,
475
+ "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/speakers.pth",
476
+ "speaker_embedding_channels": 256,
477
+ "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/language_ids.json",
478
+ "use_language_embedding": false,
479
+ "use_d_vector_file": true,
480
+ "d_vector_file": [
481
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
482
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
483
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
484
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
485
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
486
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
487
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
488
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
489
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
490
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
491
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
492
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
493
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
494
+ ],
495
+ "d_vector_dim": 512
496
+ }
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/language_ids.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bral": 0,
3
+ "bralemanha": 1,
4
+ "brba": 2,
5
+ "brce": 3,
6
+ "brgo": 4,
7
+ "brmg": 5,
8
+ "brpb": 6,
9
+ "brpe": 7,
10
+ "brportugal": 8,
11
+ "brpr": 9,
12
+ "brrj": 10,
13
+ "brrs": 11,
14
+ "brsp": 12
15
+ }
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
3
+ size 3296
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/train_syntacc.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.bin.compute_embeddings import compute_embeddings
7
+ from TTS.bin.resample import resample_files
8
+ from TTS.config.shared_configs import BaseDatasetConfig
9
+ from TTS.tts.configs.vits_config import VitsConfig
10
+ from TTS.tts.datasets import load_tts_samples
11
+ from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
12
+ from TTS.utils.downloaders import download_libri_tts
13
+ from torch.utils.data import DataLoader
14
+ from TTS.utils.samplers import PerfectBatchSampler
15
+ torch.set_num_threads(24)
16
+
17
+ # pylint: disable=W0105
18
+ """
19
+ This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
20
+ The YourTTS model is based on the VITS model; however, it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
21
+ """
22
+ CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
23
+
24
+ # Name of the run for the Trainer
25
+ RUN_NAME = "YourTTS-Syntacc-PT"
26
+
27
+ # Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
28
+ OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
29
+
30
+ # If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
31
+ RESTORE_PATH = "/raid/edresson/dev/Paper/cml_tts/checkpoints_yourtts_cml_tts_dataset/best_model.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
32
+
33
+ # This parameter is useful for debugging: it skips the training epochs and only runs the evaluation and produces the test sentences
34
+ SKIP_TRAIN_EPOCH = False
35
+
36
+ # Set here the batch size to be used in training and evaluation
37
+ BATCH_SIZE = 26
38
+
39
+ # Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
40
+ # Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
41
+ SAMPLE_RATE = 16000
42
+
43
+
44
+ DASHBOARD_LOGGER="tensorboard"
45
+ LOGGER_URI = None
46
+
47
+ DASHBOARD_LOGGER = "clearml"
48
+ LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
49
+
50
+
51
+
52
+ # Max audio length in seconds to be used in training (every audio longer than this will be ignored)
53
+ MAX_AUDIO_LEN_IN_SECONDS = float("inf")
54
+
55
+ # Define here the datasets config
56
+ brpb_train_config = BaseDatasetConfig(
57
+ formatter="coqui",
58
+ dataset_name="mupe",
59
+ meta_file_train="metadata_coqui_brpb.csv",
60
+ path="/raid/datasets/MUPE/dataset/mupe/",
61
+ language="brpb"
62
+ )
63
+
64
+ brba_train_config = BaseDatasetConfig(
65
+ formatter="coqui",
66
+ dataset_name="mupe",
67
+ meta_file_train="metadata_coqui_brba.csv",
68
+ path="/raid/datasets/MUPE/dataset/mupe/",
69
+ language="brba"
70
+ )
71
+
72
+ brportugal_train_config = BaseDatasetConfig(
73
+ formatter="coqui",
74
+ dataset_name="mupe",
75
+ meta_file_train="metadata_coqui_brportugal.csv",
76
+ path="/raid/datasets/MUPE/dataset/mupe/",
77
+ language="brportugal"
78
+ )
79
+
80
+ brsp_train_config = BaseDatasetConfig(
81
+ formatter="coqui",
82
+ dataset_name="mupe",
83
+ meta_file_train="metadata_coqui_brsp.csv",
84
+ path="/raid/datasets/MUPE/dataset/mupe/",
85
+ language="brsp"
86
+ )
87
+
88
+ brpe_train_config = BaseDatasetConfig(
89
+ formatter="coqui",
90
+ dataset_name="mupe",
91
+ meta_file_train="metadata_coqui_brpe.csv",
92
+ path="/raid/datasets/MUPE/dataset/mupe/",
93
+ language="brpe"
94
+ )
95
+
96
+ brmg_train_config = BaseDatasetConfig(
97
+ formatter="coqui",
98
+ dataset_name="mupe",
99
+ meta_file_train="metadata_coqui_brmg.csv",
100
+ path="/raid/datasets/MUPE/dataset/mupe/",
101
+ language="brmg"
102
+ )
103
+
104
+ brrj_train_config = BaseDatasetConfig(
105
+ formatter="coqui",
106
+ dataset_name="mupe",
107
+ meta_file_train="metadata_coqui_brrj.csv",
108
+ path="/raid/datasets/MUPE/dataset/mupe/",
109
+ language="brrj"
110
+ )
111
+
112
+ brce_train_config = BaseDatasetConfig(
113
+ formatter="coqui",
114
+ dataset_name="mupe",
115
+ meta_file_train="metadata_coqui_brce.csv",
116
+ path="/raid/datasets/MUPE/dataset/mupe/",
117
+ language="brce"
118
+ )
119
+
120
+ brrs_train_config = BaseDatasetConfig(
121
+ formatter="coqui",
122
+ dataset_name="mupe",
123
+ meta_file_train="metadata_coqui_brrs.csv",
124
+ path="/raid/datasets/MUPE/dataset/mupe/",
125
+ language="brrs"
126
+ )
127
+
128
+ bralemanha_train_config = BaseDatasetConfig(
129
+ formatter="coqui",
130
+ dataset_name="mupe",
131
+ meta_file_train="metadata_coqui_bralemanha.csv",
132
+ path="/raid/datasets/MUPE/dataset/mupe/",
133
+ language="bralemanha"
134
+ )
135
+
136
+ brgo_train_config = BaseDatasetConfig(
137
+ formatter="coqui",
138
+ dataset_name="mupe",
139
+ meta_file_train="metadata_coqui_brgo.csv",
140
+ path="/raid/datasets/MUPE/dataset/mupe/",
141
+ language="brgo"
142
+ )
143
+
144
+ bral_train_config = BaseDatasetConfig(
145
+ formatter="coqui",
146
+ dataset_name="mupe",
147
+ meta_file_train="metadata_coqui_bral.csv",
148
+ path="/raid/datasets/MUPE/dataset/mupe/",
149
+ language="bral"
150
+ )
151
+
152
+ brpr_train_config = BaseDatasetConfig(
153
+ formatter="coqui",
154
+ dataset_name="mupe",
155
+ meta_file_train="metadata_coqui_brpr.csv",
156
+ path="/raid/datasets/MUPE/dataset/mupe/",
157
+ language="brpr"
158
+ )
159
+
160
+ bres_train_config = BaseDatasetConfig(
161
+ formatter="coqui",
162
+ dataset_name="mupe",
163
+ meta_file_train="metadata_coqui_bres.csv",
164
+ path="/raid/datasets/MUPE/dataset/mupe/",
165
+ language="bres"
166
+ )
167
+
168
+ brpi_train_config = BaseDatasetConfig(
169
+ formatter="coqui",
170
+ dataset_name="mupe",
171
+ meta_file_train="metadata_coqui_brpi.csv",
172
+ path="/raid/datasets/MUPE/dataset/mupe/",
173
+ language="brpi"
174
+ )
175
+
176
+ # bres_train_config, brpi_train_config no files found
177
+ DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
178
+
179
+
180
+ ### Extract speaker embeddings
181
+ SPEAKER_ENCODER_CHECKPOINT_PATH = (
182
+ "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
183
+ )
184
+ SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
185
+
186
+ D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
187
+
188
+ # Iterate over all the dataset configs, checking whether the speaker embeddings are already computed; if not, compute them
189
+ for dataset_conf in DATASETS_CONFIG_LIST:
190
+ # Check whether the embeddings were already computed; if not, compute them
191
+ embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
192
+ if not os.path.isfile(embeddings_file):
193
+ print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
194
+ compute_embeddings(
195
+ SPEAKER_ENCODER_CHECKPOINT_PATH,
196
+ SPEAKER_ENCODER_CONFIG_PATH,
197
+ embeddings_file,
198
+ old_speakers_file=None,
199
+ config_dataset_path=None,
200
+ formatter_name=dataset_conf.formatter,
201
+ dataset_name=dataset_conf.dataset_name,
202
+ dataset_path=dataset_conf.path,
203
+ meta_file_train=dataset_conf.meta_file_train,
204
+ meta_file_val=dataset_conf.meta_file_val,
205
+ disable_cuda=False,
206
+ no_eval=False,
207
+ )
208
+ D_VECTOR_FILES.append(embeddings_file)
209
+
210
+
211
+ # Audio config used in training.
212
+ audio_config = VitsAudioConfig(
213
+ sample_rate=SAMPLE_RATE,
214
+ hop_length=256,
215
+ win_length=1024,
216
+ fft_size=1024,
217
+ mel_fmin=0.0,
218
+ mel_fmax=None,
219
+ num_mels=80,
220
+ )
221
+
222
+ # Init VITSArgs setting the arguments that are needed for the YourTTS model
223
+ model_args = VitsArgs(
224
+ spec_segment_size=62,
225
+ hidden_channels=192,
226
+ hidden_channels_ffn_text_encoder=768,
227
+ num_heads_text_encoder=2,
228
+ num_layers_text_encoder=10,
229
+ kernel_size_text_encoder=3,
230
+ dropout_p_text_encoder=0.1,
231
+ d_vector_file=D_VECTOR_FILES,
232
+ use_d_vector_file=True,
233
+ d_vector_dim=512,
234
+ speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
235
+ speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
236
+ resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
237
+ # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
238
+ use_speaker_encoder_as_loss=False,
239
+ # Useful parameters to enable multilingual training
240
+ use_language_embedding=False,
241
+ embedded_language_dim=4,
242
+ use_adaptive_weight_text_encoder=True,
243
+ use_perfect_class_batch_sampler=True,
244
+ perfect_class_batch_sampler_key="language"
245
+ )
246
+
247
+ # General training config, here you can change the batch size and others useful parameters
248
+ config = VitsConfig(
249
+ output_path=OUT_PATH,
250
+ model_args=model_args,
251
+ run_name=RUN_NAME,
252
+ project_name="SYNTACC",
253
+ run_description="""
254
+ - YourTTS with SYNTACC text encoder
255
+ """,
256
+ dashboard_logger=DASHBOARD_LOGGER,
257
+ logger_uri=LOGGER_URI,
258
+ audio=audio_config,
259
+ batch_size=BATCH_SIZE,
260
+ batch_group_size=48,
261
+ eval_batch_size=BATCH_SIZE,
262
+ num_loader_workers=8,
263
+ eval_split_max_size=256,
264
+ print_step=50,
265
+ plot_step=100,
266
+ log_model_step=1000,
267
+ save_step=5000,
268
+ save_n_checkpoints=2,
269
+ save_checkpoints=True,
270
+ # target_loss="loss_1",
271
+ print_eval=False,
272
+ use_phonemes=False,
273
+ phonemizer="espeak",
274
+ phoneme_language="en",
275
+ compute_input_seq_cache=True,
276
+ add_blank=True,
277
+ text_cleaner="multilingual_cleaners",
278
+ characters=CharactersConfig(
279
+ characters_class="TTS.tts.models.vits.VitsCharacters",
280
+ pad="_",
281
+ eos="&",
282
+ bos="*",
283
+ blank=None,
284
+ characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
285
+ punctuations="\u2014!'(),-.:;?\u00bf ",
286
+ phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
287
+ is_unique=True,
288
+ is_sorted=True,
289
+ ),
290
+ phoneme_cache_path=None,
291
+ precompute_num_workers=12,
292
+ start_by_longest=True,
293
+ datasets=DATASETS_CONFIG_LIST,
294
+ cudnn_benchmark=False,
295
+ max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
296
+ mixed_precision=False,
297
+ test_sentences=[
298
+ # GUSTAVO: only people from the training set
299
+ ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
300
+ ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
301
+ ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
302
+ ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
303
+ ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
304
+ ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
305
+ ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
306
+ ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
307
+ ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
308
+ ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
309
+ ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
310
+ ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
311
+ ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
312
+ # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
313
+ # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
314
+ ],
315
+ # Enable the weighted sampler
316
+ use_weighted_sampler=True,
317
+ # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
318
+ # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
319
+ weighted_sampler_attrs={"language": 1.0},
320
+ weighted_sampler_multipliers={
321
+ # "speaker_name": {
322
+ # You can force the batching scheme to give a higher weight to a certain speaker, and then this speaker will appear more frequently in the batch.
323
+ # It will speed up the speaker adaptation process. Consider the CML train dataset, with "new_speaker" as the name of the speaker that you want to adapt.
324
+ # The line below will make the balancer consider the "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers present in the CML dataset.
325
+ # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
326
+ # }
327
+ },
328
+ # Sets the Speaker Consistency Loss (SCL) α to 9, as in the YourTTS paper
329
+ speaker_encoder_loss_alpha=9.0,
330
+ )
331
+
332
+ # Load all the dataset samples and split them into training and evaluation sets
333
+ train_samples, eval_samples = load_tts_samples(
334
+ config.datasets,
335
+ eval_split=True,
336
+ eval_split_max_size=config.eval_split_max_size,
337
+ eval_split_size=config.eval_split_size,
338
+ )
339
+
340
+ # Init the model
341
+ model = Vits.init_from_config(config)
342
+
343
+ # Init the trainer and 🚀
344
+ trainer = Trainer(
345
+ TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
346
+ config,
347
+ output_path=OUT_PATH,
348
+ model=model,
349
+ train_samples=train_samples,
350
+ eval_samples=eval_samples,
351
+ )
352
+ trainer.fit()
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/trainer_0_log.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92bf84450b9ef1865a5f553a00c5d3649069dd6b17b314e548a429a52a8a9f3f
3
+ size 1423682
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdcbc348b4c18b558e8d8b5409027adf5897da1fce86b72795aaaf3635d3cb90
3
+ size 1044057262
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/best_model_87818.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cdcbc348b4c18b558e8d8b5409027adf5897da1fce86b72795aaaf3635d3cb90
3
+ size 1044057262
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/config.json ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/raid/datasets/MUPE/Experiments/runs",
3
+ "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
4
+ "run_name": "YourTTS-Syntacc-PT",
5
+ "project_name": "SYNTACC",
6
+ "run_description": "\n - YourTTS with SYNTACC text encoder\n ",
7
+ "print_step": 50,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "clearml",
12
+ "save_on_interrupt": true,
13
+ "log_model_step": 1000,
14
+ "save_step": 5000,
15
+ "save_n_checkpoints": 2,
16
+ "save_checkpoints": true,
17
+ "save_all_best": false,
18
+ "save_best_after": 10000,
19
+ "target_loss": null,
20
+ "print_eval": false,
21
+ "test_delay_epochs": 0,
22
+ "run_eval": true,
23
+ "run_eval_steps": null,
24
+ "distributed_backend": "nccl",
25
+ "distributed_url": "tcp://localhost:54321",
26
+ "mixed_precision": false,
27
+ "precision": "fp16",
28
+ "epochs": 1000,
29
+ "batch_size": 26,
30
+ "eval_batch_size": 26,
31
+ "grad_clip": [
32
+ 1000,
33
+ 1000
34
+ ],
35
+ "scheduler_after_epoch": true,
36
+ "lr": 0.001,
37
+ "optimizer": "AdamW",
38
+ "optimizer_params": {
39
+ "betas": [
40
+ 0.8,
41
+ 0.99
42
+ ],
43
+ "eps": 1e-09,
44
+ "weight_decay": 0.01
45
+ },
46
+ "lr_scheduler": null,
47
+ "lr_scheduler_params": {},
48
+ "use_grad_scaler": false,
49
+ "allow_tf32": false,
50
+ "cudnn_enable": true,
51
+ "cudnn_deterministic": false,
52
+ "cudnn_benchmark": false,
53
+ "training_seed": 54321,
54
+ "model": "vits",
55
+ "num_loader_workers": 8,
56
+ "num_eval_loader_workers": 0,
57
+ "use_noise_augment": false,
58
+ "audio": {
59
+ "fft_size": 1024,
60
+ "sample_rate": 16000,
61
+ "win_length": 1024,
62
+ "hop_length": 256,
63
+ "num_mels": 80,
64
+ "mel_fmin": 0.0,
65
+ "mel_fmax": null
66
+ },
67
+ "use_phonemes": false,
68
+ "phonemizer": "espeak",
69
+ "phoneme_language": "en",
70
+ "compute_input_seq_cache": true,
71
+ "text_cleaner": "multilingual_cleaners",
72
+ "enable_eos_bos_chars": false,
73
+ "test_sentences_file": "",
74
+ "phoneme_cache_path": null,
75
+ "characters": {
76
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
77
+ "vocab_dict": null,
78
+ "pad": "_",
79
+ "eos": "&",
80
+ "bos": "*",
81
+ "blank": null,
82
+ "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
83
+ "punctuations": "\u2014!'(),-.:;?\u00bf ",
84
+ "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
85
+ "is_unique": true,
86
+ "is_sorted": true
87
+ },
88
+ "add_blank": true,
89
+ "batch_group_size": 48,
90
+ "loss_masking": null,
91
+ "min_audio_len": 1,
92
+ "max_audio_len": Infinity,
93
+ "min_text_len": 1,
94
+ "max_text_len": Infinity,
95
+ "compute_f0": false,
96
+ "compute_energy": false,
97
+ "compute_linear_spec": true,
98
+ "precompute_num_workers": 12,
99
+ "start_by_longest": true,
100
+ "shuffle": false,
101
+ "drop_last": false,
102
+ "datasets": [
103
+ {
104
+ "formatter": "coqui",
105
+ "dataset_name": "mupe",
106
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
107
+ "meta_file_train": "metadata_coqui_brpb.csv",
108
+ "ignored_speakers": null,
109
+ "language": "brpb",
110
+ "phonemizer": "",
111
+ "meta_file_val": "",
112
+ "meta_file_attn_mask": ""
113
+ },
114
+ {
115
+ "formatter": "coqui",
116
+ "dataset_name": "mupe",
117
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
118
+ "meta_file_train": "metadata_coqui_brba.csv",
119
+ "ignored_speakers": null,
120
+ "language": "brba",
121
+ "phonemizer": "",
122
+ "meta_file_val": "",
123
+ "meta_file_attn_mask": ""
124
+ },
125
+ {
126
+ "formatter": "coqui",
127
+ "dataset_name": "mupe",
128
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
129
+ "meta_file_train": "metadata_coqui_brportugal.csv",
130
+ "ignored_speakers": null,
131
+ "language": "brportugal",
132
+ "phonemizer": "",
133
+ "meta_file_val": "",
134
+ "meta_file_attn_mask": ""
135
+ },
136
+ {
137
+ "formatter": "coqui",
138
+ "dataset_name": "mupe",
139
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
140
+ "meta_file_train": "metadata_coqui_brsp.csv",
141
+ "ignored_speakers": null,
142
+ "language": "brsp",
143
+ "phonemizer": "",
144
+ "meta_file_val": "",
145
+ "meta_file_attn_mask": ""
146
+ },
147
+ {
148
+ "formatter": "coqui",
149
+ "dataset_name": "mupe",
150
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
151
+ "meta_file_train": "metadata_coqui_brpe.csv",
152
+ "ignored_speakers": null,
153
+ "language": "brpe",
154
+ "phonemizer": "",
155
+ "meta_file_val": "",
156
+ "meta_file_attn_mask": ""
157
+ },
158
+ {
159
+ "formatter": "coqui",
160
+ "dataset_name": "mupe",
161
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
162
+ "meta_file_train": "metadata_coqui_brmg.csv",
163
+ "ignored_speakers": null,
164
+ "language": "brmg",
165
+ "phonemizer": "",
166
+ "meta_file_val": "",
167
+ "meta_file_attn_mask": ""
168
+ },
169
+ {
170
+ "formatter": "coqui",
171
+ "dataset_name": "mupe",
172
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
173
+ "meta_file_train": "metadata_coqui_brrj.csv",
174
+ "ignored_speakers": null,
175
+ "language": "brrj",
176
+ "phonemizer": "",
177
+ "meta_file_val": "",
178
+ "meta_file_attn_mask": ""
179
+ },
180
+ {
181
+ "formatter": "coqui",
182
+ "dataset_name": "mupe",
183
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
184
+ "meta_file_train": "metadata_coqui_brce.csv",
185
+ "ignored_speakers": null,
186
+ "language": "brce",
187
+ "phonemizer": "",
188
+ "meta_file_val": "",
189
+ "meta_file_attn_mask": ""
190
+ },
191
+ {
192
+ "formatter": "coqui",
193
+ "dataset_name": "mupe",
194
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
195
+ "meta_file_train": "metadata_coqui_brrs.csv",
196
+ "ignored_speakers": null,
197
+ "language": "brrs",
198
+ "phonemizer": "",
199
+ "meta_file_val": "",
200
+ "meta_file_attn_mask": ""
201
+ },
202
+ {
203
+ "formatter": "coqui",
204
+ "dataset_name": "mupe",
205
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
206
+ "meta_file_train": "metadata_coqui_bralemanha.csv",
207
+ "ignored_speakers": null,
208
+ "language": "bralemanha",
209
+ "phonemizer": "",
210
+ "meta_file_val": "",
211
+ "meta_file_attn_mask": ""
212
+ },
213
+ {
214
+ "formatter": "coqui",
215
+ "dataset_name": "mupe",
216
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
217
+ "meta_file_train": "metadata_coqui_brgo.csv",
218
+ "ignored_speakers": null,
219
+ "language": "brgo",
220
+ "phonemizer": "",
221
+ "meta_file_val": "",
222
+ "meta_file_attn_mask": ""
223
+ },
224
+ {
225
+ "formatter": "coqui",
226
+ "dataset_name": "mupe",
227
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
228
+ "meta_file_train": "metadata_coqui_bral.csv",
229
+ "ignored_speakers": null,
230
+ "language": "bral",
231
+ "phonemizer": "",
232
+ "meta_file_val": "",
233
+ "meta_file_attn_mask": ""
234
+ },
235
+ {
236
+ "formatter": "coqui",
237
+ "dataset_name": "mupe",
238
+ "path": "/raid/datasets/MUPE/dataset/mupe/",
239
+ "meta_file_train": "metadata_coqui_brpr.csv",
240
+ "ignored_speakers": null,
241
+ "language": "brpr",
242
+ "phonemizer": "",
243
+ "meta_file_val": "",
244
+ "meta_file_attn_mask": ""
245
+ }
246
+ ],
247
+ "test_sentences": [
248
+ [
249
+ "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
250
+ "EDILEINE_FONSECA",
251
+ null,
252
+ "brsp"
253
+ ],
254
+ [
255
+ "Quem semeia ventos, colhe tempestades.",
256
+ "JOSE_PAULO_DE_ARAUJO",
257
+ null,
258
+ "brpb"
259
+ ],
260
+ [
261
+ "O olho do dono \u00e9 que engorda o gado.",
262
+ "VITOR_RAFAEL_OLIVEIRA_ALVES",
263
+ null,
264
+ "brba"
265
+ ],
266
+ [
267
+ "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
268
+ "MARIA_AURORA_FELIX",
269
+ null,
270
+ "brportugal"
271
+ ],
272
+ [
273
+ "Quem espera sempre alcan\u00e7a.",
274
+ "ANTONIO_DE_AMORIM_COSTA",
275
+ null,
276
+ "brpe"
277
+ ],
278
+ [
279
+ "Cada macaco no seu galho.",
280
+ "ALCIDES_DE_LIMA",
281
+ null,
282
+ "brmg"
283
+ ],
284
+ [
285
+ "Em terra de cego, quem tem um olho \u00e9 rei.",
286
+ "ALUISIO_SOARES_DE_SOUSA",
287
+ null,
288
+ "brrj"
289
+ ],
290
+ [
291
+ "A ocasi\u00e3o faz o ladr\u00e3o.",
292
+ "FRANCISCO_JOSE_MOREIRA_MOTA",
293
+ null,
294
+ "brce"
295
+ ],
296
+ [
297
+ "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
298
+ "EVALDO_ANDRADA_CORREA",
299
+ null,
300
+ "brrs"
301
+ ],
302
+ [
303
+ "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
304
+ "DORIS_ALEXANDER",
305
+ null,
306
+ "bralemanha"
307
+ ],
308
+ [
309
+ "Quem n\u00e3o arrisca, n\u00e3o petisca.",
310
+ "DONALDO_LUIZ_DE_ALMEIDA",
311
+ null,
312
+ "brgo"
313
+ ],
314
+ [
315
+ "A uni\u00e3o faz a for\u00e7a.",
316
+ "GERONCIO_HENRIQUE_NETO",
317
+ null,
318
+ "bral"
319
+ ],
320
+ [
321
+ "Em boca fechada n\u00e3o entra mosquito.",
322
+ "MALU_NATEL_FREIRE_WEBER",
323
+ null,
324
+ "brpr"
325
+ ]
326
+ ],
327
+ "eval_split_max_size": 256,
328
+ "eval_split_size": 0.01,
329
+ "use_speaker_weighted_sampler": false,
330
+ "speaker_weighted_sampler_alpha": 1.0,
331
+ "use_language_weighted_sampler": false,
332
+ "language_weighted_sampler_alpha": 1.0,
333
+ "use_length_weighted_sampler": false,
334
+ "length_weighted_sampler_alpha": 1.0,
335
+ "model_args": {
336
+ "num_chars": 266,
337
+ "out_channels": 513,
338
+ "spec_segment_size": 62,
339
+ "hidden_channels": 192,
340
+ "use_adaptive_weight_text_encoder": true,
341
+ "use_perfect_class_batch_sampler": true,
342
+ "perfect_class_batch_sampler_key": "language",
343
+ "hidden_channels_ffn_text_encoder": 768,
344
+ "num_heads_text_encoder": 2,
345
+ "num_layers_text_encoder": 10,
346
+ "kernel_size_text_encoder": 3,
347
+ "dropout_p_text_encoder": 0.1,
348
+ "dropout_p_duration_predictor": 0.5,
349
+ "kernel_size_posterior_encoder": 5,
350
+ "dilation_rate_posterior_encoder": 1,
351
+ "num_layers_posterior_encoder": 16,
352
+ "kernel_size_flow": 5,
353
+ "dilation_rate_flow": 1,
354
+ "num_layers_flow": 4,
355
+ "resblock_type_decoder": "2",
356
+ "resblock_kernel_sizes_decoder": [
357
+ 3,
358
+ 7,
359
+ 11
360
+ ],
361
+ "resblock_dilation_sizes_decoder": [
362
+ [
363
+ 1,
364
+ 3,
365
+ 5
366
+ ],
367
+ [
368
+ 1,
369
+ 3,
370
+ 5
371
+ ],
372
+ [
373
+ 1,
374
+ 3,
375
+ 5
376
+ ]
377
+ ],
378
+ "upsample_rates_decoder": [
379
+ 8,
380
+ 8,
381
+ 2,
382
+ 2
383
+ ],
384
+ "upsample_initial_channel_decoder": 512,
385
+ "upsample_kernel_sizes_decoder": [
386
+ 16,
387
+ 16,
388
+ 4,
389
+ 4
390
+ ],
391
+ "periods_multi_period_discriminator": [
392
+ 2,
393
+ 3,
394
+ 5,
395
+ 7,
396
+ 11
397
+ ],
398
+ "use_sdp": true,
399
+ "noise_scale": 1.0,
400
+ "inference_noise_scale": 0.667,
401
+ "length_scale": 1,
402
+ "noise_scale_dp": 1.0,
403
+ "inference_noise_scale_dp": 1.0,
404
+ "max_inference_len": null,
405
+ "init_discriminator": true,
406
+ "use_spectral_norm_disriminator": false,
407
+ "use_speaker_embedding": false,
408
+ "num_speakers": 0,
409
+ "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/speakers.pth",
410
+ "d_vector_file": [
411
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
412
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
413
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
414
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
415
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
416
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
417
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
418
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
419
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
420
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
421
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
422
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
423
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
424
+ ],
425
+ "speaker_embedding_channels": 256,
426
+ "use_d_vector_file": true,
427
+ "d_vector_dim": 512,
428
+ "detach_dp_input": true,
429
+ "use_language_embedding": false,
430
+ "embedded_language_dim": 4,
431
+ "num_languages": 0,
432
+ "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/language_ids.json",
433
+ "use_speaker_encoder_as_loss": false,
434
+ "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
435
+ "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
436
+ "condition_dp_on_speaker": true,
437
+ "freeze_encoder": false,
438
+ "freeze_DP": false,
439
+ "freeze_PE": false,
440
+ "freeze_flow_decoder": false,
441
+ "freeze_waveform_decoder": false,
442
+ "encoder_sample_rate": null,
443
+ "interpolate_z": true,
444
+ "reinit_DP": false,
445
+ "reinit_text_encoder": false
446
+ },
447
+ "lr_gen": 0.0002,
448
+ "lr_disc": 0.0002,
449
+ "lr_scheduler_gen": "ExponentialLR",
450
+ "lr_scheduler_gen_params": {
451
+ "gamma": 0.999875,
452
+ "last_epoch": -1
453
+ },
454
+ "lr_scheduler_disc": "ExponentialLR",
455
+ "lr_scheduler_disc_params": {
456
+ "gamma": 0.999875,
457
+ "last_epoch": -1
458
+ },
459
+ "kl_loss_alpha": 1.0,
460
+ "disc_loss_alpha": 1.0,
461
+ "gen_loss_alpha": 1.0,
462
+ "feat_loss_alpha": 1.0,
463
+ "mel_loss_alpha": 45.0,
464
+ "dur_loss_alpha": 1.0,
465
+ "speaker_encoder_loss_alpha": 9.0,
466
+ "return_wav": true,
467
+ "use_weighted_sampler": true,
468
+ "weighted_sampler_attrs": {
469
+ "language": 1.0
470
+ },
471
+ "weighted_sampler_multipliers": {},
472
+ "r": 1,
473
+ "num_speakers": 0,
474
+ "use_speaker_embedding": false,
475
+ "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/speakers.pth",
476
+ "speaker_embedding_channels": 256,
477
+ "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/language_ids.json",
478
+ "use_language_embedding": false,
479
+ "use_d_vector_file": true,
480
+ "d_vector_file": [
481
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
482
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
483
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
484
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
485
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
486
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
487
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
488
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
489
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
490
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
491
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
492
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
493
+ "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
494
+ ],
495
+ "d_vector_dim": 512
496
+ }
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/language_ids.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bral": 0,
3
+ "bralemanha": 1,
4
+ "brba": 2,
5
+ "brce": 3,
6
+ "brgo": 4,
7
+ "brmg": 5,
8
+ "brpb": 6,
9
+ "brpe": 7,
10
+ "brportugal": 8,
11
+ "brpr": 9,
12
+ "brrj": 10,
13
+ "brrs": 11,
14
+ "brsp": 12
15
+ }
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/speakers.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
3
+ size 3296
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/train_syntacc.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.bin.compute_embeddings import compute_embeddings
7
+ from TTS.bin.resample import resample_files
8
+ from TTS.config.shared_configs import BaseDatasetConfig
9
+ from TTS.tts.configs.vits_config import VitsConfig
10
+ from TTS.tts.datasets import load_tts_samples
11
+ from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
12
+ from TTS.utils.downloaders import download_libri_tts
13
+ from torch.utils.data import DataLoader
14
+ from TTS.utils.samplers import PerfectBatchSampler
15
+ torch.set_num_threads(24)
16
+
17
+ # pylint: disable=W0105
18
+ """
19
+ This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
20
+ YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
21
+ """
22
+ CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
23
+
24
+ # Name of the run for the Trainer
25
+ RUN_NAME = "YourTTS-Syntacc-PT"
26
+
27
+ # Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
28
+ OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
29
+
30
+ # If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
31
+ RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
32
+
33
+ # This parameter is useful for debugging: it skips the training epochs and only runs the evaluation and produces the test sentences
34
+ SKIP_TRAIN_EPOCH = False
35
+
36
+ # Set here the batch size to be used in training and evaluation
37
+ BATCH_SIZE = 26
38
+
39
+ # Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
40
+ # Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
41
+ SAMPLE_RATE = 16000
42
+
43
+
44
+ DASHBOARD_LOGGER="tensorboard"
45
+ LOGGER_URI = None
46
+
47
+ DASHBOARD_LOGGER = "clearml"
48
+ LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
49
+
50
+
51
+
52
+ # Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
53
+ MAX_AUDIO_LEN_IN_SECONDS = float("inf")
54
+
55
+ # Define here the datasets config
56
+ brpb_train_config = BaseDatasetConfig(
57
+ formatter="coqui",
58
+ dataset_name="mupe",
59
+ meta_file_train="metadata_coqui_brpb.csv",
60
+ path="/raid/datasets/MUPE/dataset/mupe/",
61
+ language="brpb"
62
+ )
63
+
64
+ brba_train_config = BaseDatasetConfig(
65
+ formatter="coqui",
66
+ dataset_name="mupe",
67
+ meta_file_train="metadata_coqui_brba.csv",
68
+ path="/raid/datasets/MUPE/dataset/mupe/",
69
+ language="brba"
70
+ )
71
+
72
+ brportugal_train_config = BaseDatasetConfig(
73
+ formatter="coqui",
74
+ dataset_name="mupe",
75
+ meta_file_train="metadata_coqui_brportugal.csv",
76
+ path="/raid/datasets/MUPE/dataset/mupe/",
77
+ language="brportugal"
78
+ )
79
+
80
+ brsp_train_config = BaseDatasetConfig(
81
+ formatter="coqui",
82
+ dataset_name="mupe",
83
+ meta_file_train="metadata_coqui_brsp.csv",
84
+ path="/raid/datasets/MUPE/dataset/mupe/",
85
+ language="brsp"
86
+ )
87
+
88
+ brpe_train_config = BaseDatasetConfig(
89
+ formatter="coqui",
90
+ dataset_name="mupe",
91
+ meta_file_train="metadata_coqui_brpe.csv",
92
+ path="/raid/datasets/MUPE/dataset/mupe/",
93
+ language="brpe"
94
+ )
95
+
96
+ brmg_train_config = BaseDatasetConfig(
97
+ formatter="coqui",
98
+ dataset_name="mupe",
99
+ meta_file_train="metadata_coqui_brmg.csv",
100
+ path="/raid/datasets/MUPE/dataset/mupe/",
101
+ language="brmg"
102
+ )
103
+
104
+ brrj_train_config = BaseDatasetConfig(
105
+ formatter="coqui",
106
+ dataset_name="mupe",
107
+ meta_file_train="metadata_coqui_brrj.csv",
108
+ path="/raid/datasets/MUPE/dataset/mupe/",
109
+ language="brrj"
110
+ )
111
+
112
+ brce_train_config = BaseDatasetConfig(
113
+ formatter="coqui",
114
+ dataset_name="mupe",
115
+ meta_file_train="metadata_coqui_brce.csv",
116
+ path="/raid/datasets/MUPE/dataset/mupe/",
117
+ language="brce"
118
+ )
119
+
120
+ brrs_train_config = BaseDatasetConfig(
121
+ formatter="coqui",
122
+ dataset_name="mupe",
123
+ meta_file_train="metadata_coqui_brrs.csv",
124
+ path="/raid/datasets/MUPE/dataset/mupe/",
125
+ language="brrs"
126
+ )
127
+
128
+ bralemanha_train_config = BaseDatasetConfig(
129
+ formatter="coqui",
130
+ dataset_name="mupe",
131
+ meta_file_train="metadata_coqui_bralemanha.csv",
132
+ path="/raid/datasets/MUPE/dataset/mupe/",
133
+ language="bralemanha"
134
+ )
135
+
136
+ brgo_train_config = BaseDatasetConfig(
137
+ formatter="coqui",
138
+ dataset_name="mupe",
139
+ meta_file_train="metadata_coqui_brgo.csv",
140
+ path="/raid/datasets/MUPE/dataset/mupe/",
141
+ language="brgo"
142
+ )
143
+
144
+ bral_train_config = BaseDatasetConfig(
145
+ formatter="coqui",
146
+ dataset_name="mupe",
147
+ meta_file_train="metadata_coqui_bral.csv",
148
+ path="/raid/datasets/MUPE/dataset/mupe/",
149
+ language="bral"
150
+ )
151
+
152
+ brpr_train_config = BaseDatasetConfig(
153
+ formatter="coqui",
154
+ dataset_name="mupe",
155
+ meta_file_train="metadata_coqui_brpr.csv",
156
+ path="/raid/datasets/MUPE/dataset/mupe/",
157
+ language="brpr"
158
+ )
159
+
160
+ bres_train_config = BaseDatasetConfig(
161
+ formatter="coqui",
162
+ dataset_name="mupe",
163
+ meta_file_train="metadata_coqui_bres.csv",
164
+ path="/raid/datasets/MUPE/dataset/mupe/",
165
+ language="bres"
166
+ )
167
+
168
+ brpi_train_config = BaseDatasetConfig(
169
+ formatter="coqui",
170
+ dataset_name="mupe",
171
+ meta_file_train="metadata_coqui_brpi.csv",
172
+ path="/raid/datasets/MUPE/dataset/mupe/",
173
+ language="brpi"
174
+ )
175
+
176
+ # bres_train_config, brpi_train_config no files found
177
+ DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
178
+
179
+
180
+ ### Extract speaker embeddings
181
+ SPEAKER_ENCODER_CHECKPOINT_PATH = (
182
+ "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
183
+ )
184
+ SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
185
+
186
+ D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
187
+
188
+ # Iterate over all the dataset configs, checking whether the speaker embeddings are already computed; if not, compute them
189
+ for dataset_conf in DATASETS_CONFIG_LIST:
190
+ # Check if the embeddings weren't already computed, if not compute it
191
+ embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
192
+ if not os.path.isfile(embeddings_file):
193
+ print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
194
+ compute_embeddings(
195
+ SPEAKER_ENCODER_CHECKPOINT_PATH,
196
+ SPEAKER_ENCODER_CONFIG_PATH,
197
+ embeddings_file,
198
+ old_speakers_file=None,
199
+ config_dataset_path=None,
200
+ formatter_name=dataset_conf.formatter,
201
+ dataset_name=dataset_conf.dataset_name,
202
+ dataset_path=dataset_conf.path,
203
+ meta_file_train=dataset_conf.meta_file_train,
204
+ meta_file_val=dataset_conf.meta_file_val,
205
+ disable_cuda=False,
206
+ no_eval=False,
207
+ )
208
+ D_VECTOR_FILES.append(embeddings_file)
209
+
210
+
211
+ # Audio config used in training.
212
+ audio_config = VitsAudioConfig(
213
+ sample_rate=SAMPLE_RATE,
214
+ hop_length=256,
215
+ win_length=1024,
216
+ fft_size=1024,
217
+ mel_fmin=0.0,
218
+ mel_fmax=None,
219
+ num_mels=80,
220
+ )
221
+
222
+ # Init VITSArgs setting the arguments that are needed for the YourTTS model
223
+ model_args = VitsArgs(
224
+ spec_segment_size=62,
225
+ hidden_channels=192,
226
+ hidden_channels_ffn_text_encoder=768,
227
+ num_heads_text_encoder=2,
228
+ num_layers_text_encoder=10,
229
+ kernel_size_text_encoder=3,
230
+ dropout_p_text_encoder=0.1,
231
+ d_vector_file=D_VECTOR_FILES,
232
+ use_d_vector_file=True,
233
+ d_vector_dim=512,
234
+ speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
235
+ speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
236
+ resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
237
+ # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
238
+ use_speaker_encoder_as_loss=False,
239
+ # Useful parameters to enable multilingual training
240
+ use_language_embedding=False,
241
+ embedded_language_dim=4,
242
+ use_adaptive_weight_text_encoder=True,
243
+ use_perfect_class_batch_sampler=True,
244
+ perfect_class_batch_sampler_key="language"
245
+ )
246
+
247
+ # General training config, here you can change the batch size and others useful parameters
248
+ config = VitsConfig(
249
+ output_path=OUT_PATH,
250
+ model_args=model_args,
251
+ run_name=RUN_NAME,
252
+ project_name="SYNTACC",
253
+ run_description="""
254
+ - YourTTS with SYNTACC text encoder
255
+ """,
256
+ dashboard_logger=DASHBOARD_LOGGER,
257
+ logger_uri=LOGGER_URI,
258
+ audio=audio_config,
259
+ batch_size=BATCH_SIZE,
260
+ batch_group_size=48,
261
+ eval_batch_size=BATCH_SIZE,
262
+ num_loader_workers=8,
263
+ eval_split_max_size=256,
264
+ print_step=50,
265
+ plot_step=100,
266
+ log_model_step=1000,
267
+ save_step=5000,
268
+ save_n_checkpoints=2,
269
+ save_checkpoints=True,
270
+ # target_loss="loss_1",
271
+ print_eval=False,
272
+ use_phonemes=False,
273
+ phonemizer="espeak",
274
+ phoneme_language="en",
275
+ compute_input_seq_cache=True,
276
+ add_blank=True,
277
+ text_cleaner="multilingual_cleaners",
278
+ characters=CharactersConfig(
279
+ characters_class="TTS.tts.models.vits.VitsCharacters",
280
+ pad="_",
281
+ eos="&",
282
+ bos="*",
283
+ blank=None,
284
+ characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
285
+ punctuations="\u2014!'(),-.:;?\u00bf ",
286
+ phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
287
+ is_unique=True,
288
+ is_sorted=True,
289
+ ),
290
+ phoneme_cache_path=None,
291
+ precompute_num_workers=12,
292
+ start_by_longest=True,
293
+ datasets=DATASETS_CONFIG_LIST,
294
+ cudnn_benchmark=False,
295
+ max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
296
+ mixed_precision=False,
297
+ test_sentences=[
298
+ #GUSTAVO: apenas pessoas do treino
299
+ ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
300
+ ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
301
+ ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
302
+ ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
303
+ ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
304
+ ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
305
+ ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
306
+ ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
307
+ ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
308
+ ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
309
+ ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
310
+ ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
311
+ ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
312
+ # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
313
+ # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
314
+ ],
315
+ # Enable the weighted sampler
316
+ use_weighted_sampler=True,
317
+ # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
318
+ # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
319
+ weighted_sampler_attrs={"language": 1.0},
320
+ weighted_sampler_multipliers={
321
+ # "speaker_name": {
322
+ # you can force the batching scheme to give a higher weight to a certain speaker so that this speaker will appear more frequently in the batch.
323
+ # It will speedup the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
324
+ # The line above will make the balancer consider the "new_speaker" as 106 speakers so 1/4 of the number of speakers present on CML dataset.
325
+ # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
326
+ # }
327
+ },
328
+ # It defines the Speaker Consistency Loss (SCL) α to 9 like the YourTTS paper
329
+ speaker_encoder_loss_alpha=9.0,
330
+ )
331
+
332
+ # Load all the dataset samples and split them into training and evaluation sets
333
+ train_samples, eval_samples = load_tts_samples(
334
+ config.datasets,
335
+ eval_split=True,
336
+ eval_split_max_size=config.eval_split_max_size,
337
+ eval_split_size=config.eval_split_size,
338
+ )
339
+
340
+ # Init the model
341
+ model = Vits.init_from_config(config)
342
+
343
+ # Init the trainer and 🚀
344
+ trainer = Trainer(
345
+ TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
346
+ config,
347
+ output_path=OUT_PATH,
348
+ model=model,
349
+ train_samples=train_samples,
350
+ eval_samples=eval_samples,
351
+ )
352
+ trainer.fit()
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/trainer_0_log.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc570d138b2c5d578e697b92e2d2d060c0945fcd0f880761a3fc800eaf619b6a
3
+ size 97918
Experiments/train_syntacc.py ADDED
@@ -0,0 +1,352 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.bin.compute_embeddings import compute_embeddings
7
+ from TTS.bin.resample import resample_files
8
+ from TTS.config.shared_configs import BaseDatasetConfig
9
+ from TTS.tts.configs.vits_config import VitsConfig
10
+ from TTS.tts.datasets import load_tts_samples
11
+ from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
12
+ from TTS.utils.downloaders import download_libri_tts
13
+ from torch.utils.data import DataLoader
14
+ from TTS.utils.samplers import PerfectBatchSampler
15
+ torch.set_num_threads(24)
16
+
17
+ # pylint: disable=W0105
18
+ """
19
+ This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
20
+ YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
21
+ """
22
+ CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
23
+
24
+ # Name of the run for the Trainer
25
+ RUN_NAME = "YourTTS-Syntacc-PT"
26
+
27
+ # Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
28
+ OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs") # "/raid/coqui/Checkpoints/original-YourTTS/"
29
+
30
+ # If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
31
+ RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth" # Download the checkpoint here: https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
32
+
33
+ # This parameter is useful for debugging: it skips the training epochs and only runs the evaluation and produces the test sentences
34
+ SKIP_TRAIN_EPOCH = False
35
+
36
+ # Set here the batch size to be used in training and evaluation
37
+ BATCH_SIZE = 26
38
+
39
+ # Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
40
+ # Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
41
+ SAMPLE_RATE = 16000
42
+
43
+
44
+ DASHBOARD_LOGGER="tensorboard"
45
+ LOGGER_URI = None
46
+
47
+ DASHBOARD_LOGGER = "clearml"
48
+ LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
49
+
50
+
51
+
52
+ # Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
53
+ MAX_AUDIO_LEN_IN_SECONDS = float("inf")
54
+
55
+ # Define here the datasets config
56
+ brpb_train_config = BaseDatasetConfig(
57
+ formatter="coqui",
58
+ dataset_name="mupe",
59
+ meta_file_train="metadata_coqui_brpb.csv",
60
+ path="/raid/datasets/MUPE/dataset/mupe/",
61
+ language="brpb"
62
+ )
63
+
64
+ brba_train_config = BaseDatasetConfig(
65
+ formatter="coqui",
66
+ dataset_name="mupe",
67
+ meta_file_train="metadata_coqui_brba.csv",
68
+ path="/raid/datasets/MUPE/dataset/mupe/",
69
+ language="brba"
70
+ )
71
+
72
+ brportugal_train_config = BaseDatasetConfig(
73
+ formatter="coqui",
74
+ dataset_name="mupe",
75
+ meta_file_train="metadata_coqui_brportugal.csv",
76
+ path="/raid/datasets/MUPE/dataset/mupe/",
77
+ language="brportugal"
78
+ )
79
+
80
+ brsp_train_config = BaseDatasetConfig(
81
+ formatter="coqui",
82
+ dataset_name="mupe",
83
+ meta_file_train="metadata_coqui_brsp.csv",
84
+ path="/raid/datasets/MUPE/dataset/mupe/",
85
+ language="brsp"
86
+ )
87
+
88
+ brpe_train_config = BaseDatasetConfig(
89
+ formatter="coqui",
90
+ dataset_name="mupe",
91
+ meta_file_train="metadata_coqui_brpe.csv",
92
+ path="/raid/datasets/MUPE/dataset/mupe/",
93
+ language="brpe"
94
+ )
95
+
96
+ brmg_train_config = BaseDatasetConfig(
97
+ formatter="coqui",
98
+ dataset_name="mupe",
99
+ meta_file_train="metadata_coqui_brmg.csv",
100
+ path="/raid/datasets/MUPE/dataset/mupe/",
101
+ language="brmg"
102
+ )
103
+
104
+ brrj_train_config = BaseDatasetConfig(
105
+ formatter="coqui",
106
+ dataset_name="mupe",
107
+ meta_file_train="metadata_coqui_brrj.csv",
108
+ path="/raid/datasets/MUPE/dataset/mupe/",
109
+ language="brrj"
110
+ )
111
+
112
+ brce_train_config = BaseDatasetConfig(
113
+ formatter="coqui",
114
+ dataset_name="mupe",
115
+ meta_file_train="metadata_coqui_brce.csv",
116
+ path="/raid/datasets/MUPE/dataset/mupe/",
117
+ language="brce"
118
+ )
119
+
120
+ brrs_train_config = BaseDatasetConfig(
121
+ formatter="coqui",
122
+ dataset_name="mupe",
123
+ meta_file_train="metadata_coqui_brrs.csv",
124
+ path="/raid/datasets/MUPE/dataset/mupe/",
125
+ language="brrs"
126
+ )
127
+
128
+ bralemanha_train_config = BaseDatasetConfig(
129
+ formatter="coqui",
130
+ dataset_name="mupe",
131
+ meta_file_train="metadata_coqui_bralemanha.csv",
132
+ path="/raid/datasets/MUPE/dataset/mupe/",
133
+ language="bralemanha"
134
+ )
135
+
136
+ brgo_train_config = BaseDatasetConfig(
137
+ formatter="coqui",
138
+ dataset_name="mupe",
139
+ meta_file_train="metadata_coqui_brgo.csv",
140
+ path="/raid/datasets/MUPE/dataset/mupe/",
141
+ language="brgo"
142
+ )
143
+
144
+ bral_train_config = BaseDatasetConfig(
145
+ formatter="coqui",
146
+ dataset_name="mupe",
147
+ meta_file_train="metadata_coqui_bral.csv",
148
+ path="/raid/datasets/MUPE/dataset/mupe/",
149
+ language="bral"
150
+ )
151
+
152
+ brpr_train_config = BaseDatasetConfig(
153
+ formatter="coqui",
154
+ dataset_name="mupe",
155
+ meta_file_train="metadata_coqui_brpr.csv",
156
+ path="/raid/datasets/MUPE/dataset/mupe/",
157
+ language="brpr"
158
+ )
159
+
160
+ bres_train_config = BaseDatasetConfig(
161
+ formatter="coqui",
162
+ dataset_name="mupe",
163
+ meta_file_train="metadata_coqui_bres.csv",
164
+ path="/raid/datasets/MUPE/dataset/mupe/",
165
+ language="bres"
166
+ )
167
+
168
+ brpi_train_config = BaseDatasetConfig(
169
+ formatter="coqui",
170
+ dataset_name="mupe",
171
+ meta_file_train="metadata_coqui_brpi.csv",
172
+ path="/raid/datasets/MUPE/dataset/mupe/",
173
+ language="brpi"
174
+ )
175
+
176
+ # bres_train_config, brpi_train_config no files found
177
+ DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
178
+
179
+
180
+ ### Extract speaker embeddings
181
+ SPEAKER_ENCODER_CHECKPOINT_PATH = (
182
+ "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
183
+ )
184
+ SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
185
+
186
+ D_VECTOR_FILES = [] # List of speaker embeddings/d-vectors to be used during the training
187
+
188
+ # Iterate over all the dataset configs, checking whether the speaker embeddings are already computed; if not, compute them
189
+ for dataset_conf in DATASETS_CONFIG_LIST:
190
+ # Check if the embeddings weren't already computed, if not compute it
191
+ embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
192
+ if not os.path.isfile(embeddings_file):
193
+ print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
194
+ compute_embeddings(
195
+ SPEAKER_ENCODER_CHECKPOINT_PATH,
196
+ SPEAKER_ENCODER_CONFIG_PATH,
197
+ embeddings_file,
198
+ old_speakers_file=None,
199
+ config_dataset_path=None,
200
+ formatter_name=dataset_conf.formatter,
201
+ dataset_name=dataset_conf.dataset_name,
202
+ dataset_path=dataset_conf.path,
203
+ meta_file_train=dataset_conf.meta_file_train,
204
+ meta_file_val=dataset_conf.meta_file_val,
205
+ disable_cuda=False,
206
+ no_eval=False,
207
+ )
208
+ D_VECTOR_FILES.append(embeddings_file)
209
+
210
+
211
+ # Audio config used in training.
212
+ audio_config = VitsAudioConfig(
213
+ sample_rate=SAMPLE_RATE,
214
+ hop_length=256,
215
+ win_length=1024,
216
+ fft_size=1024,
217
+ mel_fmin=0.0,
218
+ mel_fmax=None,
219
+ num_mels=80,
220
+ )
221
+
222
+ # Init VITSArgs setting the arguments that are needed for the YourTTS model
223
+ model_args = VitsArgs(
224
+ spec_segment_size=62,
225
+ hidden_channels=192,
226
+ hidden_channels_ffn_text_encoder=768,
227
+ num_heads_text_encoder=2,
228
+ num_layers_text_encoder=10,
229
+ kernel_size_text_encoder=3,
230
+ dropout_p_text_encoder=0.1,
231
+ d_vector_file=D_VECTOR_FILES,
232
+ use_d_vector_file=True,
233
+ d_vector_dim=512,
234
+ speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
235
+ speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
236
+ resblock_type_decoder="2", # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
237
+ # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
238
+ use_speaker_encoder_as_loss=False,
239
+ # Useful parameters to enable multilingual training
240
+ use_language_embedding=False,
241
+ embedded_language_dim=4,
242
+ use_adaptive_weight_text_encoder=True,
243
+ use_perfect_class_batch_sampler=True,
244
+ perfect_class_batch_sampler_key="language"
245
+ )
246
+
247
+ # General training config, here you can change the batch size and others useful parameters
248
+ config = VitsConfig(
249
+ output_path=OUT_PATH,
250
+ model_args=model_args,
251
+ run_name=RUN_NAME,
252
+ project_name="SYNTACC",
253
+ run_description="""
254
+ - YourTTS with SYNTACC text encoder
255
+ """,
256
+ dashboard_logger=DASHBOARD_LOGGER,
257
+ logger_uri=LOGGER_URI,
258
+ audio=audio_config,
259
+ batch_size=BATCH_SIZE,
260
+ batch_group_size=48,
261
+ eval_batch_size=BATCH_SIZE,
262
+ num_loader_workers=8,
263
+ eval_split_max_size=256,
264
+ print_step=50,
265
+ plot_step=100,
266
+ log_model_step=1000,
267
+ save_step=5000,
268
+ save_n_checkpoints=2,
269
+ save_checkpoints=True,
270
+ # target_loss="loss_1",
271
+ print_eval=False,
272
+ use_phonemes=False,
273
+ phonemizer="espeak",
274
+ phoneme_language="en",
275
+ compute_input_seq_cache=True,
276
+ add_blank=True,
277
+ text_cleaner="multilingual_cleaners",
278
+ characters=CharactersConfig(
279
+ characters_class="TTS.tts.models.vits.VitsCharacters",
280
+ pad="_",
281
+ eos="&",
282
+ bos="*",
283
+ blank=None,
284
+ characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
285
+ punctuations="\u2014!'(),-.:;?\u00bf ",
286
+ phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
287
+ is_unique=True,
288
+ is_sorted=True,
289
+ ),
290
+ phoneme_cache_path=None,
291
+ precompute_num_workers=12,
292
+ start_by_longest=True,
293
+ datasets=DATASETS_CONFIG_LIST,
294
+ cudnn_benchmark=False,
295
+ max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
296
+ mixed_precision=False,
297
+ test_sentences=[
298
+ #GUSTAVO: apenas pessoas do treino
299
+ ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
300
+ ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
301
+ ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
302
+ ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
303
+ ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
304
+ ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
305
+ ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
306
+ ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
307
+ ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
308
+ ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
309
+ ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
310
+ ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
311
+ ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
312
+ # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
313
+ # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
314
+ ],
315
+ # Enable the weighted sampler
316
+ use_weighted_sampler=True,
317
+ # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
318
+ # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
319
+ weighted_sampler_attrs={"language": 1.0},
320
+ weighted_sampler_multipliers={
321
+ # "speaker_name": {
322
+ # you can force the batching scheme to give a higher weight to a certain speaker so that this speaker will appear more frequently in the batch.
323
+ # It will speedup the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
324
+ # The line above will make the balancer consider the "new_speaker" as 106 speakers so 1/4 of the number of speakers present on CML dataset.
325
+ # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
326
+ # }
327
+ },
328
+ # It defines the Speaker Consistency Loss (SCL) α to 9 like the YourTTS paper
329
+ speaker_encoder_loss_alpha=9.0,
330
+ )
331
+
332
+ # Load all the dataset samples and split them into training and evaluation sets
333
+ train_samples, eval_samples = load_tts_samples(
334
+ config.datasets,
335
+ eval_split=True,
336
+ eval_split_max_size=config.eval_split_max_size,
337
+ eval_split_size=config.eval_split_size,
338
+ )
339
+
340
+ # Init the model
341
+ model = Vits.init_from_config(config)
342
+
343
+ # Init the trainer and 🚀
344
+ trainer = Trainer(
345
+ TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
346
+ config,
347
+ output_path=OUT_PATH,
348
+ model=model,
349
+ train_samples=train_samples,
350
+ eval_samples=eval_samples,
351
+ )
352
+ trainer.fit()