Edresson committed on Jan 28, 2024

Commit

1dfa9fd

1 Parent(s): 2bc0892

Update

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +16 -34
.gitignore +3 -0
Experiments/TTS +1 -0
Experiments/nohup.out +3 -0
Experiments/run/events.out.tfevents.1706367627.edresson-train-80.45395.0 +3 -0
Experiments/run/events.out.tfevents.1706367849.edresson-train-80.46052.0 +3 -0
Experiments/run/events.out.tfevents.1706367954.edresson-train-80.46941.0 +3 -0
Experiments/run/events.out.tfevents.1706446227.edresson-train-80.140666.0 +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model_124752.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_130000.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_135000.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/config.json +496 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json +15 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/train_syntacc_baseline.py +352 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/trainer_0_log.txt +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model_85001.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json +496 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json +15 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/train_syntacc_baseline.py +352 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/trainer_0_log.txt +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model_87192.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_130000.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_135000.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json +496 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json +15 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth +3 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/train_syntacc_baseline.py +352 -0
Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/trainer_0_log.txt +3 -0
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/best_model.pth +3 -0
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/best_model_78415.pth +3 -0
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_80000.pth +3 -0
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth +3 -0
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/config.json +496 -0
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/language_ids.json +15 -0
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/speakers.pth +3 -0
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/train_syntacc.py +352 -0
Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/trainer_0_log.txt +3 -0
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/best_model.pth +3 -0
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/best_model_87818.pth +3 -0
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/config.json +496 -0
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/language_ids.json +15 -0
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/speakers.pth +3 -0
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/train_syntacc.py +352 -0
Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/trainer_0_log.txt +3 -0
Experiments/train_syntacc.py +352 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,17 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.txt filter=lfs diff=lfs merge=lfs -text
+*.t7 filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.out filter=lfs diff=lfs merge=lfs -text
+*.0 filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
+*.o filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
+HierSpeech_TTS/denoiser/g_best filter=lfs diff=lfs merge=lfs -text
+g_best filter=lfs diff=lfs merge=lfs -text
+TTS-private/nohup.out filter=lfs diff=lfs merge=lfs -text
+nohup.out filter=lfs diff=lfs merge=lfs -text
+TTS-private/run/events.out.tfevents.1705084461.edresson-train-80-2.93786.0 filter=lfs diff=lfs merge=lfs -text
+events.out.tfevents.1705084461.edresson-train-80-2.93786.0 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+*/.git/*
+.git
+.git/*

Experiments/TTS ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit a45dfd62668cd5778dd6a384308097ba0370c034

Experiments/nohup.out ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9b67f0b1dbf0a04937b9b6db1e55de4cc6057c5f1832ebb1a8c4e3c2f4b5a9e6
+size 19940074

Experiments/run/events.out.tfevents.1706367627.edresson-train-80.45395.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa9dfc334b721e0bc90371cdf519b1bb244a637f4c56ce0ef949f76b5848ee8d
+size 347255243

Experiments/run/events.out.tfevents.1706367849.edresson-train-80.46052.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:90283b4420a0282755e4425857d26cf58a98b7229057b9e6a5aca19014168184
+size 1238111

Experiments/run/events.out.tfevents.1706367954.edresson-train-80.46941.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4fb96459ffb6c55b7e51e5ee3a13a2f265f2c1579a78756e789583906320e81e
+size 350277161

Experiments/run/events.out.tfevents.1706446227.edresson-train-80.140666.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15518ce7cf33a4ff76601734d638436b43895ab61e6b4efe25b5a24b495f529c
+size 21123264

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c62e29c7a1dd4f701ab4998e0b1f569cfe7486cc7806f149c1ff857f172383e0
+size 1043220702

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/best_model_124752.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c62e29c7a1dd4f701ab4998e0b1f569cfe7486cc7806f149c1ff857f172383e0
+size 1043220702

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_130000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a71ead47e605fc525b264ad882fd54630c15a42eb69aaf88993d26d5ea84ae3b
+size 1043220766

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/checkpoint_135000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96e16ee83729813041c17f6edf8a702bdf59e7afe345cfad1fe65dd4ba0b1fce
+size 1043220766

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/config.json ADDED Viewed

	@@ -0,0 +1,496 @@

+{
+    "output_path": "/raid/datasets/MUPE/Experiments/runs",
+    "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
+    "run_name": "YourTTS-Baseline-PT",
+    "project_name": "SYNTACC",
+    "run_description": "\n            - YourTTS with SYNTACC text encoder\n        ",
+    "print_step": 50,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "clearml",
+    "save_on_interrupt": true,
+    "log_model_step": 1000,
+    "save_step": 5000,
+    "save_n_checkpoints": 2,
+    "save_checkpoints": true,
+    "save_all_best": false,
+    "save_best_after": 10000,
+    "target_loss": null,
+    "print_eval": false,
+    "test_delay_epochs": 0,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": false,
+    "precision": "fp16",
+    "epochs": 1000,
+    "batch_size": 26,
+    "eval_batch_size": 26,
+    "grad_clip": [
+        1000,
+        1000
+    ],
+    "scheduler_after_epoch": true,
+    "lr": 0.001,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": null,
+    "lr_scheduler_params": {},
+    "use_grad_scaler": false,
+    "allow_tf32": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "vits",
+    "num_loader_workers": 8,
+    "num_eval_loader_workers": 0,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "sample_rate": 16000,
+        "win_length": 1024,
+        "hop_length": 256,
+        "num_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null
+    },
+    "use_phonemes": false,
+    "phonemizer": "espeak",
+    "phoneme_language": "en",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": null,
+    "characters": {
+        "characters_class": "TTS.tts.models.vits.VitsCharacters",
+        "vocab_dict": null,
+        "pad": "_",
+        "eos": "&",
+        "bos": "*",
+        "blank": null,
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        "punctuations": "\u2014!'(),-.:;?\u00bf ",
+        "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        "is_unique": true,
+        "is_sorted": true
+    },
+    "add_blank": true,
+    "batch_group_size": 48,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": Infinity,
+    "compute_f0": false,
+    "compute_energy": false,
+    "compute_linear_spec": true,
+    "precompute_num_workers": 12,
+    "start_by_longest": true,
+    "shuffle": false,
+    "drop_last": false,
+    "datasets": [
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpb.csv",
+            "ignored_speakers": null,
+            "language": "brpb",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brba.csv",
+            "ignored_speakers": null,
+            "language": "brba",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brportugal.csv",
+            "ignored_speakers": null,
+            "language": "brportugal",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brsp.csv",
+            "ignored_speakers": null,
+            "language": "brsp",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpe.csv",
+            "ignored_speakers": null,
+            "language": "brpe",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brmg.csv",
+            "ignored_speakers": null,
+            "language": "brmg",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrj.csv",
+            "ignored_speakers": null,
+            "language": "brrj",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brce.csv",
+            "ignored_speakers": null,
+            "language": "brce",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrs.csv",
+            "ignored_speakers": null,
+            "language": "brrs",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bralemanha.csv",
+            "ignored_speakers": null,
+            "language": "bralemanha",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brgo.csv",
+            "ignored_speakers": null,
+            "language": "brgo",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bral.csv",
+            "ignored_speakers": null,
+            "language": "bral",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpr.csv",
+            "ignored_speakers": null,
+            "language": "brpr",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [
+        [
+            "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
+            "EDILEINE_FONSECA",
+            null,
+            "brsp"
+        ],
+        [
+            "Quem semeia ventos, colhe tempestades.",
+            "JOSE_PAULO_DE_ARAUJO",
+            null,
+            "brpb"
+        ],
+        [
+            "O olho do dono \u00e9 que engorda o gado.",
+            "VITOR_RAFAEL_OLIVEIRA_ALVES",
+            null,
+            "brba"
+        ],
+        [
+            "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
+            "MARIA_AURORA_FELIX",
+            null,
+            "brportugal"
+        ],
+        [
+            "Quem espera sempre alcan\u00e7a.",
+            "ANTONIO_DE_AMORIM_COSTA",
+            null,
+            "brpe"
+        ],
+        [
+            "Cada macaco no seu galho.",
+            "ALCIDES_DE_LIMA",
+            null,
+            "brmg"
+        ],
+        [
+            "Em terra de cego, quem tem um olho \u00e9 rei.",
+            "ALUISIO_SOARES_DE_SOUSA",
+            null,
+            "brrj"
+        ],
+        [
+            "A ocasi\u00e3o faz o ladr\u00e3o.",
+            "FRANCISCO_JOSE_MOREIRA_MOTA",
+            null,
+            "brce"
+        ],
+        [
+            "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
+            "EVALDO_ANDRADA_CORREA",
+            null,
+            "brrs"
+        ],
+        [
+            "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
+            "DORIS_ALEXANDER",
+            null,
+            "bralemanha"
+        ],
+        [
+            "Quem n\u00e3o arrisca, n\u00e3o petisca.",
+            "DONALDO_LUIZ_DE_ALMEIDA",
+            null,
+            "brgo"
+        ],
+        [
+            "A uni\u00e3o faz a for\u00e7a.",
+            "GERONCIO_HENRIQUE_NETO",
+            null,
+            "bral"
+        ],
+        [
+            "Em boca fechada n\u00e3o entra mosquito.",
+            "MALU_NATEL_FREIRE_WEBER",
+            null,
+            "brpr"
+        ]
+    ],
+    "eval_split_max_size": 256,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "model_args": {
+        "num_chars": 266,
+        "out_channels": 513,
+        "spec_segment_size": 62,
+        "hidden_channels": 192,
+        "use_adaptive_weight_text_encoder": false,
+        "use_perfect_class_batch_sampler": true,
+        "perfect_class_batch_sampler_key": "language",
+        "hidden_channels_ffn_text_encoder": 768,
+        "num_heads_text_encoder": 2,
+        "num_layers_text_encoder": 10,
+        "kernel_size_text_encoder": 3,
+        "dropout_p_text_encoder": 0.1,
+        "dropout_p_duration_predictor": 0.5,
+        "kernel_size_posterior_encoder": 5,
+        "dilation_rate_posterior_encoder": 1,
+        "num_layers_posterior_encoder": 16,
+        "kernel_size_flow": 5,
+        "dilation_rate_flow": 1,
+        "num_layers_flow": 4,
+        "resblock_type_decoder": "2",
+        "resblock_kernel_sizes_decoder": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes_decoder": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates_decoder": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel_decoder": 512,
+        "upsample_kernel_sizes_decoder": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "periods_multi_period_discriminator": [
+            2,
+            3,
+            5,
+            7,
+            11
+        ],
+        "use_sdp": true,
+        "noise_scale": 1.0,
+        "inference_noise_scale": 0.667,
+        "length_scale": 1,
+        "noise_scale_dp": 1.0,
+        "inference_noise_scale_dp": 1.0,
+        "max_inference_len": null,
+        "init_discriminator": true,
+        "use_spectral_norm_disriminator": false,
+        "use_speaker_embedding": false,
+        "num_speakers": 0,
+        "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth",
+        "d_vector_file": [
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
+        ],
+        "speaker_embedding_channels": 256,
+        "use_d_vector_file": true,
+        "d_vector_dim": 512,
+        "detach_dp_input": true,
+        "use_language_embedding": true,
+        "embedded_language_dim": 4,
+        "num_languages": 0,
+        "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json",
+        "use_speaker_encoder_as_loss": false,
+        "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+        "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+        "condition_dp_on_speaker": true,
+        "freeze_encoder": false,
+        "freeze_DP": false,
+        "freeze_PE": false,
+        "freeze_flow_decoder": false,
+        "freeze_waveform_decoder": false,
+        "encoder_sample_rate": null,
+        "interpolate_z": true,
+        "reinit_DP": false,
+        "reinit_text_encoder": false
+    },
+    "lr_gen": 0.0002,
+    "lr_disc": 0.0002,
+    "lr_scheduler_gen": "ExponentialLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "lr_scheduler_disc": "ExponentialLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "kl_loss_alpha": 1.0,
+    "disc_loss_alpha": 1.0,
+    "gen_loss_alpha": 1.0,
+    "feat_loss_alpha": 1.0,
+    "mel_loss_alpha": 45.0,
+    "dur_loss_alpha": 1.0,
+    "speaker_encoder_loss_alpha": 9.0,
+    "return_wav": true,
+    "use_weighted_sampler": true,
+    "weighted_sampler_attrs": {
+        "language": 1.0
+    },
+    "weighted_sampler_multipliers": {},
+    "r": 1,
+    "num_speakers": 0,
+    "use_speaker_embedding": false,
+    "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth",
+    "speaker_embedding_channels": 256,
+    "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json",
+    "use_language_embedding": true,
+    "use_d_vector_file": true,
+    "d_vector_file": [
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
+    ],
+    "d_vector_dim": 512
+}

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/language_ids.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "bral": 0,
+    "bralemanha": 1,
+    "brba": 2,
+    "brce": 3,
+    "brgo": 4,
+    "brmg": 5,
+    "brpb": 6,
+    "brpe": 7,
+    "brportugal": 8,
+    "brpr": 9,
+    "brrj": 10,
+    "brrs": 11,
+    "brsp": 12
+}

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/speakers.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
+size 3296

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/train_syntacc_baseline.py ADDED Viewed

	@@ -0,0 +1,352 @@

+import os
+import torch
+from trainer import Trainer, TrainerArgs
+from TTS.bin.compute_embeddings import compute_embeddings
+from TTS.bin.resample import resample_files
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.configs.vits_config import VitsConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
+from TTS.utils.downloaders import download_libri_tts
+from torch.utils.data import DataLoader
+from TTS.utils.samplers import PerfectBatchSampler
+torch.set_num_threads(24)
+# pylint: disable=W0105
+"""
+    This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
+    The YourTTS model is based on the VITS model; however, it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
+"""
+CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
+# Name of the run for the Trainer
+RUN_NAME = "YourTTS-Baseline-PT"
+# Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
+OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs")  # "/raid/coqui/Checkpoints/original-YourTTS/"
+# If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here:  https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth"  # Download the checkpoint here:  https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+# This parameter is useful for debugging: it skips the training epochs and just runs the evaluation and produces the test sentences
+SKIP_TRAIN_EPOCH = False
+# Set here the batch size to be used in training and evaluation
+BATCH_SIZE = 26
+# Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
+# Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
+SAMPLE_RATE = 16000
+DASHBOARD_LOGGER="tensorboard"
+LOGGER_URI = None
+DASHBOARD_LOGGER = "clearml"
+LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
+# Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
+MAX_AUDIO_LEN_IN_SECONDS = float("inf")
+# Define here the datasets config
+brpb_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpb.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpb"
+)
+brba_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brba.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brba"
+)
+brportugal_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brportugal.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brportugal"
+)
+brsp_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brsp.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brsp"
+)
+brpe_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpe.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpe"
+)
+brmg_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brmg.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brmg"
+)
+brrj_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrj.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrj"
+)
+brce_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brce.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brce"
+)
+brrs_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrs.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrs"
+)
+bralemanha_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bralemanha.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bralemanha"
+)
+brgo_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brgo.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brgo"
+)
+bral_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bral.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bral"
+)
+brpr_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpr.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpr"
+)
+bres_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bres.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bres"
+)
+brpi_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpi.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpi"
+)
+# bres_train_config and brpi_train_config are left out of the list below: no files were found for them
+DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
+### Extract speaker embeddings
+SPEAKER_ENCODER_CHECKPOINT_PATH = (
+    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
+)
+SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
+D_VECTOR_FILES = []  # List of speaker embeddings/d-vectors to be used during the training
+# Iterate over all the dataset configs, checking whether the speaker embeddings have already been computed; if not, compute them
+for dataset_conf in DATASETS_CONFIG_LIST:
+    # Compute the embeddings only if the embeddings file does not exist yet
+    embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
+    if not os.path.isfile(embeddings_file):
+        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
+        compute_embeddings(
+            SPEAKER_ENCODER_CHECKPOINT_PATH,
+            SPEAKER_ENCODER_CONFIG_PATH,
+            embeddings_file,
+            old_speakers_file=None,
+            config_dataset_path=None,
+            formatter_name=dataset_conf.formatter,
+            dataset_name=dataset_conf.dataset_name,
+            dataset_path=dataset_conf.path,
+            meta_file_train=dataset_conf.meta_file_train,
+            meta_file_val=dataset_conf.meta_file_val,
+            disable_cuda=False,
+            no_eval=False,
+        )
+    D_VECTOR_FILES.append(embeddings_file)
+# Audio config used in training.
+audio_config = VitsAudioConfig(
+    sample_rate=SAMPLE_RATE,
+    hop_length=256,
+    win_length=1024,
+    fft_size=1024,
+    mel_fmin=0.0,
+    mel_fmax=None,
+    num_mels=80,
+)
+# Init VITSArgs setting the arguments that are needed for the YourTTS model
+model_args = VitsArgs(
+    spec_segment_size=62,
+    hidden_channels=192,
+    hidden_channels_ffn_text_encoder=768,
+    num_heads_text_encoder=2,
+    num_layers_text_encoder=10,
+    kernel_size_text_encoder=3,
+    dropout_p_text_encoder=0.1,
+    d_vector_file=D_VECTOR_FILES,
+    use_d_vector_file=True,
+    d_vector_dim=512,
+    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
+    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
+    resblock_type_decoder="2",  # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
+    # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
+    use_speaker_encoder_as_loss=False,
+    # Useful parameters to enable multilingual training
+    use_language_embedding=True,
+    embedded_language_dim=4,
+    use_adaptive_weight_text_encoder=False,
+    use_perfect_class_batch_sampler=True,
+    perfect_class_batch_sampler_key="language"
+)
+# General training config, here you can change the batch size and other useful parameters
+config = VitsConfig(
+    output_path=OUT_PATH,
+    model_args=model_args,
+    run_name=RUN_NAME,
+    project_name="SYNTACC",
+    run_description="""
+            - YourTTS with SYNTACC text encoder
+        """,
+    dashboard_logger=DASHBOARD_LOGGER,
+    logger_uri=LOGGER_URI,
+    audio=audio_config,
+    batch_size=BATCH_SIZE,
+    batch_group_size=48,
+    eval_batch_size=BATCH_SIZE,
+    num_loader_workers=8,
+    eval_split_max_size=256,
+    print_step=50,
+    plot_step=100,
+    log_model_step=1000,
+    save_step=5000,
+    save_n_checkpoints=2,
+    save_checkpoints=True,
+    # target_loss="loss_1",
+    print_eval=False,
+    use_phonemes=False,
+    phonemizer="espeak",
+    phoneme_language="en",
+    compute_input_seq_cache=True,
+    add_blank=True,
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class="TTS.tts.models.vits.VitsCharacters",
+        pad="_",
+        eos="&",
+        bos="*",
+        blank=None,
+        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        punctuations="\u2014!'(),-.:;?\u00bf ",
+        phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        is_unique=True,
+        is_sorted=True,
+    ),
+    phoneme_cache_path=None,
+    precompute_num_workers=12,
+    start_by_longest=True,
+    datasets=DATASETS_CONFIG_LIST,
+    cudnn_benchmark=False,
+    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
+    mixed_precision=False,
+    test_sentences=[
+        # GUSTAVO: only speakers from the training set
+        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
+        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
+        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
+        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
+        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
+        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
+        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
+        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
+        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
+        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
+        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
+        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
+        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
+        # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
+        # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
+    ],
+    # Enable the weighted sampler
+    use_weighted_sampler=True,
+    # Balances sampling over the attributes listed below no matter how many samples each class has (only "language" is active here; the commented-out variant also balances by "speaker_name")
+    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
+    weighted_sampler_attrs={"language": 1.0},
+    weighted_sampler_multipliers={
+        # "speaker_name": {
+        # You can force the batching scheme to give a higher weight to a certain speaker, so that this speaker appears more frequently in the batch.
+        # This speeds up the speaker adaptation process. Consider the CML train dataset, with "new_speaker" as the name of the speaker that you want to adapt:
+        # the line below makes the balancer treat "new_speaker" as 106 speakers, i.e. 1/4 of the number of speakers present in the CML dataset.
+        # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
+        # }
+    },
+    # Sets the Speaker Consistency Loss (SCL) weight α to 9, as in the YourTTS paper
+    speaker_encoder_loss_alpha=9.0,
+)
+# Load all the dataset samples and split them into training and evaluation sets
+train_samples, eval_samples = load_tts_samples(
+    config.datasets,
+    eval_split=True,
+    eval_split_max_size=config.eval_split_max_size,
+    eval_split_size=config.eval_split_size,
+)
+# Init the model
+model = Vits.init_from_config(config)
+# Init the trainer and 🚀
+trainer = Trainer(
+    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
+    config,
+    output_path=OUT_PATH,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+)
+trainer.fit()

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+00PM-e3c7cbd05/trainer_0_log.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9eb020abfc0ef9798a6097596138d1567d58429ca6c2ce6e59b350acc5301cff
+size 1771305

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a4a050e0d7a9c6c302b70b3f59dc195b12ad8922988de81bae55cbc1a89b9c8
+size 347719275

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/best_model_85001.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a4a050e0d7a9c6c302b70b3f59dc195b12ad8922988de81bae55cbc1a89b9c8
+size 347719275

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/config.json ADDED Viewed

	@@ -0,0 +1,496 @@

+{
+    "output_path": "/raid/datasets/MUPE/Experiments/runs",
+    "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
+    "run_name": "YourTTS-Baseline-PT",
+    "project_name": "SYNTACC",
+    "run_description": "\n            - YourTTS with SYNTACC text encoder\n        ",
+    "print_step": 50,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "clearml",
+    "save_on_interrupt": true,
+    "log_model_step": 1000,
+    "save_step": 5000,
+    "save_n_checkpoints": 2,
+    "save_checkpoints": true,
+    "save_all_best": false,
+    "save_best_after": 10000,
+    "target_loss": null,
+    "print_eval": false,
+    "test_delay_epochs": 0,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": false,
+    "precision": "fp16",
+    "epochs": 1000,
+    "batch_size": 26,
+    "eval_batch_size": 26,
+    "grad_clip": [
+        1000,
+        1000
+    ],
+    "scheduler_after_epoch": true,
+    "lr": 0.001,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": null,
+    "lr_scheduler_params": {},
+    "use_grad_scaler": false,
+    "allow_tf32": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "vits",
+    "num_loader_workers": 8,
+    "num_eval_loader_workers": 0,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "sample_rate": 16000,
+        "win_length": 1024,
+        "hop_length": 256,
+        "num_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null
+    },
+    "use_phonemes": false,
+    "phonemizer": "espeak",
+    "phoneme_language": "en",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": null,
+    "characters": {
+        "characters_class": "TTS.tts.models.vits.VitsCharacters",
+        "vocab_dict": null,
+        "pad": "_",
+        "eos": "&",
+        "bos": "*",
+        "blank": null,
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        "punctuations": "\u2014!'(),-.:;?\u00bf ",
+        "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        "is_unique": true,
+        "is_sorted": true
+    },
+    "add_blank": true,
+    "batch_group_size": 48,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": Infinity,
+    "compute_f0": false,
+    "compute_energy": false,
+    "compute_linear_spec": true,
+    "precompute_num_workers": 12,
+    "start_by_longest": true,
+    "shuffle": false,
+    "drop_last": false,
+    "datasets": [
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpb.csv",
+            "ignored_speakers": null,
+            "language": "brpb",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brba.csv",
+            "ignored_speakers": null,
+            "language": "brba",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brportugal.csv",
+            "ignored_speakers": null,
+            "language": "brportugal",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brsp.csv",
+            "ignored_speakers": null,
+            "language": "brsp",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpe.csv",
+            "ignored_speakers": null,
+            "language": "brpe",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brmg.csv",
+            "ignored_speakers": null,
+            "language": "brmg",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrj.csv",
+            "ignored_speakers": null,
+            "language": "brrj",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brce.csv",
+            "ignored_speakers": null,
+            "language": "brce",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrs.csv",
+            "ignored_speakers": null,
+            "language": "brrs",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bralemanha.csv",
+            "ignored_speakers": null,
+            "language": "bralemanha",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brgo.csv",
+            "ignored_speakers": null,
+            "language": "brgo",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bral.csv",
+            "ignored_speakers": null,
+            "language": "bral",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpr.csv",
+            "ignored_speakers": null,
+            "language": "brpr",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [
+        [
+            "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
+            "EDILEINE_FONSECA",
+            null,
+            "brsp"
+        ],
+        [
+            "Quem semeia ventos, colhe tempestades.",
+            "JOSE_PAULO_DE_ARAUJO",
+            null,
+            "brpb"
+        ],
+        [
+            "O olho do dono \u00e9 que engorda o gado.",
+            "VITOR_RAFAEL_OLIVEIRA_ALVES",
+            null,
+            "brba"
+        ],
+        [
+            "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
+            "MARIA_AURORA_FELIX",
+            null,
+            "brportugal"
+        ],
+        [
+            "Quem espera sempre alcan\u00e7a.",
+            "ANTONIO_DE_AMORIM_COSTA",
+            null,
+            "brpe"
+        ],
+        [
+            "Cada macaco no seu galho.",
+            "ALCIDES_DE_LIMA",
+            null,
+            "brmg"
+        ],
+        [
+            "Em terra de cego, quem tem um olho \u00e9 rei.",
+            "ALUISIO_SOARES_DE_SOUSA",
+            null,
+            "brrj"
+        ],
+        [
+            "A ocasi\u00e3o faz o ladr\u00e3o.",
+            "FRANCISCO_JOSE_MOREIRA_MOTA",
+            null,
+            "brce"
+        ],
+        [
+            "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
+            "EVALDO_ANDRADA_CORREA",
+            null,
+            "brrs"
+        ],
+        [
+            "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
+            "DORIS_ALEXANDER",
+            null,
+            "bralemanha"
+        ],
+        [
+            "Quem n\u00e3o arrisca, n\u00e3o petisca.",
+            "DONALDO_LUIZ_DE_ALMEIDA",
+            null,
+            "brgo"
+        ],
+        [
+            "A uni\u00e3o faz a for\u00e7a.",
+            "GERONCIO_HENRIQUE_NETO",
+            null,
+            "bral"
+        ],
+        [
+            "Em boca fechada n\u00e3o entra mosquito.",
+            "MALU_NATEL_FREIRE_WEBER",
+            null,
+            "brpr"
+        ]
+    ],
+    "eval_split_max_size": 256,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "model_args": {
+        "num_chars": 266,
+        "out_channels": 513,
+        "spec_segment_size": 62,
+        "hidden_channels": 192,
+        "use_adaptive_weight_text_encoder": false,
+        "use_perfect_class_batch_sampler": true,
+        "perfect_class_batch_sampler_key": "language",
+        "hidden_channels_ffn_text_encoder": 768,
+        "num_heads_text_encoder": 2,
+        "num_layers_text_encoder": 10,
+        "kernel_size_text_encoder": 3,
+        "dropout_p_text_encoder": 0.1,
+        "dropout_p_duration_predictor": 0.5,
+        "kernel_size_posterior_encoder": 5,
+        "dilation_rate_posterior_encoder": 1,
+        "num_layers_posterior_encoder": 16,
+        "kernel_size_flow": 5,
+        "dilation_rate_flow": 1,
+        "num_layers_flow": 4,
+        "resblock_type_decoder": "2",
+        "resblock_kernel_sizes_decoder": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes_decoder": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates_decoder": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel_decoder": 512,
+        "upsample_kernel_sizes_decoder": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "periods_multi_period_discriminator": [
+            2,
+            3,
+            5,
+            7,
+            11
+        ],
+        "use_sdp": true,
+        "noise_scale": 1.0,
+        "inference_noise_scale": 0.667,
+        "length_scale": 1,
+        "noise_scale_dp": 1.0,
+        "inference_noise_scale_dp": 1.0,
+        "max_inference_len": null,
+        "init_discriminator": true,
+        "use_spectral_norm_disriminator": false,
+        "use_speaker_embedding": false,
+        "num_speakers": 0,
+        "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth",
+        "d_vector_file": [
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
+        ],
+        "speaker_embedding_channels": 256,
+        "use_d_vector_file": true,
+        "d_vector_dim": 512,
+        "detach_dp_input": true,
+        "use_language_embedding": true,
+        "embedded_language_dim": 4,
+        "num_languages": 0,
+        "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json",
+        "use_speaker_encoder_as_loss": false,
+        "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+        "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+        "condition_dp_on_speaker": true,
+        "freeze_encoder": false,
+        "freeze_DP": false,
+        "freeze_PE": false,
+        "freeze_flow_decoder": false,
+        "freeze_waveform_decoder": false,
+        "encoder_sample_rate": null,
+        "interpolate_z": true,
+        "reinit_DP": false,
+        "reinit_text_encoder": false
+    },
+    "lr_gen": 0.0002,
+    "lr_disc": 0.0002,
+    "lr_scheduler_gen": "ExponentialLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "lr_scheduler_disc": "ExponentialLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "kl_loss_alpha": 1.0,
+    "disc_loss_alpha": 1.0,
+    "gen_loss_alpha": 1.0,
+    "feat_loss_alpha": 1.0,
+    "mel_loss_alpha": 45.0,
+    "dur_loss_alpha": 1.0,
+    "speaker_encoder_loss_alpha": 9.0,
+    "return_wav": true,
+    "use_weighted_sampler": true,
+    "weighted_sampler_attrs": {
+        "language": 1.0
+    },
+    "weighted_sampler_multipliers": {},
+    "r": 1,
+    "num_speakers": 0,
+    "use_speaker_embedding": false,
+    "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth",
+    "speaker_embedding_channels": 256,
+    "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json",
+    "use_language_embedding": true,
+    "use_d_vector_file": true,
+    "d_vector_file": [
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
+    ],
+    "d_vector_dim": 512
+}

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/language_ids.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "bral": 0,
+    "bralemanha": 1,
+    "brba": 2,
+    "brce": 3,
+    "brgo": 4,
+    "brmg": 5,
+    "brpb": 6,
+    "brpe": 7,
+    "brportugal": 8,
+    "brpr": 9,
+    "brrj": 10,
+    "brrs": 11,
+    "brsp": 12
+}

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/speakers.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
+size 3296

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/train_syntacc_baseline.py ADDED Viewed

	@@ -0,0 +1,352 @@

+import os
+import torch
+from trainer import Trainer, TrainerArgs
+from TTS.bin.compute_embeddings import compute_embeddings
+from TTS.bin.resample import resample_files
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.configs.vits_config import VitsConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
+from TTS.utils.downloaders import download_libri_tts
+from torch.utils.data import DataLoader
+from TTS.utils.samplers import PerfectBatchSampler
+torch.set_num_threads(24)
+# pylint: disable=W0105
+"""
+    This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
+    YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
+"""
+CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
+# Name of the run for the Trainer
+RUN_NAME = "YourTTS-Baseline-PT"
+# Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
+OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs")  # "/raid/coqui/Checkpoints/original-YourTTS/"
+# If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here:  https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth"  # Download the checkpoint here:  https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+# This paramter is useful to debug, it skips the training epochs and just do the evaluation  and produce the test sentences
+SKIP_TRAIN_EPOCH = False
+# Set here the batch size to be used in training and evaluation
+BATCH_SIZE = 26
+# Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
+# Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
+SAMPLE_RATE = 16000
+DASHBOARD_LOGGER="tensorboard"
+LOGGER_URI = None
+DASHBOARD_LOGGER = "clearml"
+LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
+# Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
+MAX_AUDIO_LEN_IN_SECONDS = float("inf")
+# Define here the datasets config
+brpb_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpb.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpb"
+)
+brba_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brba.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brba"
+)
+brportugal_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brportugal.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brportugal"
+)
+brsp_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brsp.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brsp"
+)
+brpe_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpe.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpe"
+)
+brmg_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brmg.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brmg"
+)
+brrj_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrj.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrj"
+)
+brce_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brce.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brce"
+)
+brrs_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrs.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrs"
+)
+bralemanha_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bralemanha.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bralemanha"
+)
+brgo_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brgo.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brgo"
+)
+bral_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bral.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bral"
+)
+brpr_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpr.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpr"
+)
+bres_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bres.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bres"
+)
+brpi_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpi.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpi"
+)
+# bres_train_config, brpi_train_config  no files found
+DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
+### Extract speaker embeddings
+SPEAKER_ENCODER_CHECKPOINT_PATH = (
+    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
+)
+SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
+D_VECTOR_FILES = []  # List of speaker embeddings/d-vectors to be used during the training
+# Iterates all the dataset configs checking if the speakers embeddings are already computated, if not compute it
+for dataset_conf in DATASETS_CONFIG_LIST:
+    # Check if the embeddings weren't already computed, if not compute it
+    embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
+    if not os.path.isfile(embeddings_file):
+        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
+        compute_embeddings(
+            SPEAKER_ENCODER_CHECKPOINT_PATH,
+            SPEAKER_ENCODER_CONFIG_PATH,
+            embeddings_file,
+            old_speakers_file=None,
+            config_dataset_path=None,
+            formatter_name=dataset_conf.formatter,
+            dataset_name=dataset_conf.dataset_name,
+            dataset_path=dataset_conf.path,
+            meta_file_train=dataset_conf.meta_file_train,
+            meta_file_val=dataset_conf.meta_file_val,
+            disable_cuda=False,
+            no_eval=False,
+        )
+    D_VECTOR_FILES.append(embeddings_file)
+# Audio config used in training.
+audio_config = VitsAudioConfig(
+    sample_rate=SAMPLE_RATE,
+    hop_length=256,
+    win_length=1024,
+    fft_size=1024,
+    mel_fmin=0.0,
+    mel_fmax=None,
+    num_mels=80,
+)
+# Init VITSArgs setting the arguments that are needed for the YourTTS model
+model_args = VitsArgs(
+    spec_segment_size=62,
+    hidden_channels=192,
+    hidden_channels_ffn_text_encoder=768,
+    num_heads_text_encoder=2,
+    num_layers_text_encoder=10,
+    kernel_size_text_encoder=3,
+    dropout_p_text_encoder=0.1,
+    d_vector_file=D_VECTOR_FILES,
+    use_d_vector_file=True,
+    d_vector_dim=512,
+    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
+    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
+    resblock_type_decoder="2",  # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
+    # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
+    use_speaker_encoder_as_loss=False,
+    # Useful parameters to enable multilingual training
+    use_language_embedding=True,
+    embedded_language_dim=4,
+    use_adaptive_weight_text_encoder=False,
+    use_perfect_class_batch_sampler=True,
+    perfect_class_batch_sampler_key="language"
+)
+# General training config, here you can change the batch size and others useful parameters
+config = VitsConfig(
+    output_path=OUT_PATH,
+    model_args=model_args,
+    run_name=RUN_NAME,
+    project_name="SYNTACC",
+    run_description="""
+            - YourTTS with SYNTACC text encoder
+        """,
+    dashboard_logger=DASHBOARD_LOGGER,
+    logger_uri=LOGGER_URI,
+    audio=audio_config,
+    batch_size=BATCH_SIZE,
+    batch_group_size=48,
+    eval_batch_size=BATCH_SIZE,
+    num_loader_workers=8,
+    eval_split_max_size=256,
+    print_step=50,
+    plot_step=100,
+    log_model_step=1000,
+    save_step=5000,
+    save_n_checkpoints=2,
+    save_checkpoints=True,
+    # target_loss="loss_1",
+    print_eval=False,
+    use_phonemes=False,
+    phonemizer="espeak",
+    phoneme_language="en",
+    compute_input_seq_cache=True,
+    add_blank=True,
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class="TTS.tts.models.vits.VitsCharacters",
+        pad="_",
+        eos="&",
+        bos="*",
+        blank=None,
+        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        punctuations="\u2014!'(),-.:;?\u00bf ",
+        phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        is_unique=True,
+        is_sorted=True,
+    ),
+    phoneme_cache_path=None,
+    precompute_num_workers=12,
+    start_by_longest=True,
+    datasets=DATASETS_CONFIG_LIST,
+    cudnn_benchmark=False,
+    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
+    mixed_precision=False,
+    test_sentences=[
+        #GUSTAVO: apenas pessoas do treino
+        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
+        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
+        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
+        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
+        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
+        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
+        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
+        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
+        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
+        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
+        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
+        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
+        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
+        # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
+        # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
+    ],
+    # Enable the weighted sampler
+    use_weighted_sampler=True,
+    # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
+    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
+    weighted_sampler_attrs={"language": 1.0},
+    weighted_sampler_multipliers={
+        # "speaker_name": {
+        # you can force the batching scheme to give a higher weight to a certain speaker and then this speaker will appears more frequently on the batch.
+        # It will speedup the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
+        # The line above will make the balancer consider the "new_speaker" as 106 speakers so 1/4 of the number of speakers present on CML dataset.
+        # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
+        # }
+    },
+    # It defines the Speaker Consistency Loss (SCL) α to 9 like the YourTTS paper
+    speaker_encoder_loss_alpha=9.0,
+)
+# Load all the datasets samples and split traning and evaluation sets
+train_samples, eval_samples = load_tts_samples(
+    config.datasets,
+    eval_split=True,
+    eval_split_max_size=config.eval_split_max_size,
+    eval_split_size=config.eval_split_size,
+)
+# Init the model
+model = Vits.init_from_config(config)
+# Init the trainer and 🚀
+trainer = Trainer(
+    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
+    config,
+    output_path=OUT_PATH,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+)
+trainer.fit()

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+04PM-2bc0892f9/trainer_0_log.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:94c095ee47fd6e763ee0e129a7728cf80e5e4f21301e767ab0141c478d369b89
+size 128993

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a082ddde12d21020f66a70cf05a74826488d10008a8379b699458d92509e85d1
+size 1043216142

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/best_model_87192.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a082ddde12d21020f66a70cf05a74826488d10008a8379b699458d92509e85d1
+size 1043216142

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_130000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5a584eb832a857f9a11180b34a84b81117d8690ed1e5fa39e4ff711cf6ffd7f7
+size 1043220766

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/checkpoint_135000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:367ac46477805942658a7a78e8cf473409537967f9382a46249a8d11521ed3f9
+size 1043220766

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/config.json ADDED Viewed

	@@ -0,0 +1,496 @@

+{
+    "output_path": "/raid/datasets/MUPE/Experiments/runs",
+    "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
+    "run_name": "YourTTS-Baseline-PT",
+    "project_name": "SYNTACC",
+    "run_description": "\n            - YourTTS with SYNTACC text encoder\n        ",
+    "print_step": 50,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "clearml",
+    "save_on_interrupt": true,
+    "log_model_step": 1000,
+    "save_step": 5000,
+    "save_n_checkpoints": 2,
+    "save_checkpoints": true,
+    "save_all_best": false,
+    "save_best_after": 10000,
+    "target_loss": null,
+    "print_eval": false,
+    "test_delay_epochs": 0,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": false,
+    "precision": "fp16",
+    "epochs": 1000,
+    "batch_size": 26,
+    "eval_batch_size": 26,
+    "grad_clip": [
+        1000,
+        1000
+    ],
+    "scheduler_after_epoch": true,
+    "lr": 0.001,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": null,
+    "lr_scheduler_params": {},
+    "use_grad_scaler": false,
+    "allow_tf32": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "vits",
+    "num_loader_workers": 8,
+    "num_eval_loader_workers": 0,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "sample_rate": 16000,
+        "win_length": 1024,
+        "hop_length": 256,
+        "num_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null
+    },
+    "use_phonemes": false,
+    "phonemizer": "espeak",
+    "phoneme_language": "en",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": null,
+    "characters": {
+        "characters_class": "TTS.tts.models.vits.VitsCharacters",
+        "vocab_dict": null,
+        "pad": "_",
+        "eos": "&",
+        "bos": "*",
+        "blank": null,
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        "punctuations": "\u2014!'(),-.:;?\u00bf ",
+        "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        "is_unique": true,
+        "is_sorted": true
+    },
+    "add_blank": true,
+    "batch_group_size": 48,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": Infinity,
+    "compute_f0": false,
+    "compute_energy": false,
+    "compute_linear_spec": true,
+    "precompute_num_workers": 12,
+    "start_by_longest": true,
+    "shuffle": false,
+    "drop_last": false,
+    "datasets": [
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpb.csv",
+            "ignored_speakers": null,
+            "language": "brpb",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brba.csv",
+            "ignored_speakers": null,
+            "language": "brba",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brportugal.csv",
+            "ignored_speakers": null,
+            "language": "brportugal",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brsp.csv",
+            "ignored_speakers": null,
+            "language": "brsp",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpe.csv",
+            "ignored_speakers": null,
+            "language": "brpe",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brmg.csv",
+            "ignored_speakers": null,
+            "language": "brmg",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrj.csv",
+            "ignored_speakers": null,
+            "language": "brrj",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brce.csv",
+            "ignored_speakers": null,
+            "language": "brce",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrs.csv",
+            "ignored_speakers": null,
+            "language": "brrs",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bralemanha.csv",
+            "ignored_speakers": null,
+            "language": "bralemanha",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brgo.csv",
+            "ignored_speakers": null,
+            "language": "brgo",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bral.csv",
+            "ignored_speakers": null,
+            "language": "bral",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpr.csv",
+            "ignored_speakers": null,
+            "language": "brpr",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [
+        [
+            "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
+            "EDILEINE_FONSECA",
+            null,
+            "brsp"
+        ],
+        [
+            "Quem semeia ventos, colhe tempestades.",
+            "JOSE_PAULO_DE_ARAUJO",
+            null,
+            "brpb"
+        ],
+        [
+            "O olho do dono \u00e9 que engorda o gado.",
+            "VITOR_RAFAEL_OLIVEIRA_ALVES",
+            null,
+            "brba"
+        ],
+        [
+            "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
+            "MARIA_AURORA_FELIX",
+            null,
+            "brportugal"
+        ],
+        [
+            "Quem espera sempre alcan\u00e7a.",
+            "ANTONIO_DE_AMORIM_COSTA",
+            null,
+            "brpe"
+        ],
+        [
+            "Cada macaco no seu galho.",
+            "ALCIDES_DE_LIMA",
+            null,
+            "brmg"
+        ],
+        [
+            "Em terra de cego, quem tem um olho \u00e9 rei.",
+            "ALUISIO_SOARES_DE_SOUSA",
+            null,
+            "brrj"
+        ],
+        [
+            "A ocasi\u00e3o faz o ladr\u00e3o.",
+            "FRANCISCO_JOSE_MOREIRA_MOTA",
+            null,
+            "brce"
+        ],
+        [
+            "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
+            "EVALDO_ANDRADA_CORREA",
+            null,
+            "brrs"
+        ],
+        [
+            "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
+            "DORIS_ALEXANDER",
+            null,
+            "bralemanha"
+        ],
+        [
+            "Quem n\u00e3o arrisca, n\u00e3o petisca.",
+            "DONALDO_LUIZ_DE_ALMEIDA",
+            null,
+            "brgo"
+        ],
+        [
+            "A uni\u00e3o faz a for\u00e7a.",
+            "GERONCIO_HENRIQUE_NETO",
+            null,
+            "bral"
+        ],
+        [
+            "Em boca fechada n\u00e3o entra mosquito.",
+            "MALU_NATEL_FREIRE_WEBER",
+            null,
+            "brpr"
+        ]
+    ],
+    "eval_split_max_size": 256,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "model_args": {
+        "num_chars": 266,
+        "out_channels": 513,
+        "spec_segment_size": 62,
+        "hidden_channels": 192,
+        "use_adaptive_weight_text_encoder": false,
+        "use_perfect_class_batch_sampler": true,
+        "perfect_class_batch_sampler_key": "language",
+        "hidden_channels_ffn_text_encoder": 768,
+        "num_heads_text_encoder": 2,
+        "num_layers_text_encoder": 10,
+        "kernel_size_text_encoder": 3,
+        "dropout_p_text_encoder": 0.1,
+        "dropout_p_duration_predictor": 0.5,
+        "kernel_size_posterior_encoder": 5,
+        "dilation_rate_posterior_encoder": 1,
+        "num_layers_posterior_encoder": 16,
+        "kernel_size_flow": 5,
+        "dilation_rate_flow": 1,
+        "num_layers_flow": 4,
+        "resblock_type_decoder": "2",
+        "resblock_kernel_sizes_decoder": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes_decoder": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates_decoder": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel_decoder": 512,
+        "upsample_kernel_sizes_decoder": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "periods_multi_period_discriminator": [
+            2,
+            3,
+            5,
+            7,
+            11
+        ],
+        "use_sdp": true,
+        "noise_scale": 1.0,
+        "inference_noise_scale": 0.667,
+        "length_scale": 1,
+        "noise_scale_dp": 1.0,
+        "inference_noise_scale_dp": 1.0,
+        "max_inference_len": null,
+        "init_discriminator": true,
+        "use_spectral_norm_disriminator": false,
+        "use_speaker_embedding": false,
+        "num_speakers": 0,
+        "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth",
+        "d_vector_file": [
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
+        ],
+        "speaker_embedding_channels": 256,
+        "use_d_vector_file": true,
+        "d_vector_dim": 512,
+        "detach_dp_input": true,
+        "use_language_embedding": true,
+        "embedded_language_dim": 4,
+        "num_languages": 0,
+        "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json",
+        "use_speaker_encoder_as_loss": false,
+        "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+        "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+        "condition_dp_on_speaker": true,
+        "freeze_encoder": false,
+        "freeze_DP": false,
+        "freeze_PE": false,
+        "freeze_flow_decoder": false,
+        "freeze_waveform_decoder": false,
+        "encoder_sample_rate": null,
+        "interpolate_z": true,
+        "reinit_DP": false,
+        "reinit_text_encoder": false
+    },
+    "lr_gen": 0.0002,
+    "lr_disc": 0.0002,
+    "lr_scheduler_gen": "ExponentialLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "lr_scheduler_disc": "ExponentialLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "kl_loss_alpha": 1.0,
+    "disc_loss_alpha": 1.0,
+    "gen_loss_alpha": 1.0,
+    "feat_loss_alpha": 1.0,
+    "mel_loss_alpha": 45.0,
+    "dur_loss_alpha": 1.0,
+    "speaker_encoder_loss_alpha": 9.0,
+    "return_wav": true,
+    "use_weighted_sampler": true,
+    "weighted_sampler_attrs": {
+        "language": 1.0
+    },
+    "weighted_sampler_multipliers": {},
+    "r": 1,
+    "num_speakers": 0,
+    "use_speaker_embedding": false,
+    "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth",
+    "speaker_embedding_channels": 256,
+    "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json",
+    "use_language_embedding": true,
+    "use_d_vector_file": true,
+    "d_vector_file": [
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
+    ],
+    "d_vector_dim": 512
+}

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/language_ids.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "bral": 0,
+    "bralemanha": 1,
+    "brba": 2,
+    "brce": 3,
+    "brgo": 4,
+    "brmg": 5,
+    "brpb": 6,
+    "brpe": 7,
+    "brportugal": 8,
+    "brpr": 9,
+    "brrj": 10,
+    "brrs": 11,
+    "brsp": 12
+}

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/speakers.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
+size 3296

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/train_syntacc_baseline.py ADDED Viewed

	@@ -0,0 +1,352 @@

+import os
+import torch
+from trainer import Trainer, TrainerArgs
+from TTS.bin.compute_embeddings import compute_embeddings
+from TTS.bin.resample import resample_files
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.configs.vits_config import VitsConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
+from TTS.utils.downloaders import download_libri_tts
+from torch.utils.data import DataLoader
+from TTS.utils.samplers import PerfectBatchSampler
+torch.set_num_threads(24)
+# pylint: disable=W0105
+"""
+    This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
+    YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
+"""
+CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
+# Name of the run for the Trainer
+RUN_NAME = "YourTTS-Baseline-PT"
+# Path where you want to save the models outputs (configs, checkpoints and tensorboard logs)
+OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs")  # "/raid/coqui/Checkpoints/original-YourTTS/"
+# If you want to do transfer learning and speedup your training you can set here the path to the CML-TTS available checkpoint that cam be downloaded here:  https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth"  # Download the checkpoint here:  https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+# This paramter is useful to debug, it skips the training epochs and just do the evaluation  and produce the test sentences
+SKIP_TRAIN_EPOCH = False
+# Set here the batch size to be used in training and evaluation
+BATCH_SIZE = 26
+# Training Sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
+# Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter are matching, otherwise resample your audios
+SAMPLE_RATE = 16000
+DASHBOARD_LOGGER="tensorboard"
+LOGGER_URI = None
+DASHBOARD_LOGGER = "clearml"
+LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
+# Max audio length in seconds to be used in training (every audio bigger than it will be ignored)
+MAX_AUDIO_LEN_IN_SECONDS = float("inf")
+# Define here the datasets config
+brpb_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpb.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpb"
+)
+brba_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brba.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brba"
+)
+brportugal_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brportugal.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brportugal"
+)
+brsp_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brsp.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brsp"
+)
+brpe_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpe.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpe"
+)
+brmg_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brmg.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brmg"
+)
+brrj_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrj.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrj"
+)
+brce_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brce.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brce"
+)
+brrs_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrs.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrs"
+)
+bralemanha_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bralemanha.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bralemanha"
+)
+brgo_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brgo.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brgo"
+)
+bral_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bral.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bral"
+)
+brpr_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpr.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpr"
+)
+bres_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bres.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bres"
+)
+brpi_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpi.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpi"
+)
+# bres_train_config, brpi_train_config  no files found
+DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
+### Extract speaker embeddings
+SPEAKER_ENCODER_CHECKPOINT_PATH = (
+    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
+)
+SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
+D_VECTOR_FILES = []  # List of speaker embeddings/d-vectors to be used during the training
+# Iterates all the dataset configs checking if the speakers embeddings are already computated, if not compute it
+for dataset_conf in DATASETS_CONFIG_LIST:
+    # Check if the embeddings weren't already computed, if not compute it
+    embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
+    if not os.path.isfile(embeddings_file):
+        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
+        compute_embeddings(
+            SPEAKER_ENCODER_CHECKPOINT_PATH,
+            SPEAKER_ENCODER_CONFIG_PATH,
+            embeddings_file,
+            old_speakers_file=None,
+            config_dataset_path=None,
+            formatter_name=dataset_conf.formatter,
+            dataset_name=dataset_conf.dataset_name,
+            dataset_path=dataset_conf.path,
+            meta_file_train=dataset_conf.meta_file_train,
+            meta_file_val=dataset_conf.meta_file_val,
+            disable_cuda=False,
+            no_eval=False,
+        )
+    D_VECTOR_FILES.append(embeddings_file)
+# Audio config used in training.
+audio_config = VitsAudioConfig(
+    sample_rate=SAMPLE_RATE,
+    hop_length=256,
+    win_length=1024,
+    fft_size=1024,
+    mel_fmin=0.0,
+    mel_fmax=None,
+    num_mels=80,
+)
+# Init VITSArgs setting the arguments that are needed for the YourTTS model
+model_args = VitsArgs(
+    spec_segment_size=62,
+    hidden_channels=192,
+    hidden_channels_ffn_text_encoder=768,
+    num_heads_text_encoder=2,
+    num_layers_text_encoder=10,
+    kernel_size_text_encoder=3,
+    dropout_p_text_encoder=0.1,
+    d_vector_file=D_VECTOR_FILES,
+    use_d_vector_file=True,
+    d_vector_dim=512,
+    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
+    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
+    resblock_type_decoder="2",  # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
+    # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper
+    use_speaker_encoder_as_loss=False,
+    # Useful parameters to enable multilingual training
+    use_language_embedding=True,
+    embedded_language_dim=4,
+    use_adaptive_weight_text_encoder=False,
+    use_perfect_class_batch_sampler=True,
+    perfect_class_batch_sampler_key="language"
+)
+# General training config, here you can change the batch size and others useful parameters
+config = VitsConfig(
+    output_path=OUT_PATH,
+    model_args=model_args,
+    run_name=RUN_NAME,
+    project_name="SYNTACC",
+    run_description="""
+            - YourTTS with SYNTACC text encoder
+        """,
+    dashboard_logger=DASHBOARD_LOGGER,
+    logger_uri=LOGGER_URI,
+    audio=audio_config,
+    batch_size=BATCH_SIZE,
+    batch_group_size=48,
+    eval_batch_size=BATCH_SIZE,
+    num_loader_workers=8,
+    eval_split_max_size=256,
+    print_step=50,
+    plot_step=100,
+    log_model_step=1000,
+    save_step=5000,
+    save_n_checkpoints=2,
+    save_checkpoints=True,
+    # target_loss="loss_1",
+    print_eval=False,
+    use_phonemes=False,
+    phonemizer="espeak",
+    phoneme_language="en",
+    compute_input_seq_cache=True,
+    add_blank=True,
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class="TTS.tts.models.vits.VitsCharacters",
+        pad="_",
+        eos="&",
+        bos="*",
+        blank=None,
+        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        punctuations="\u2014!'(),-.:;?\u00bf ",
+        phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        is_unique=True,
+        is_sorted=True,
+    ),
+    phoneme_cache_path=None,
+    precompute_num_workers=12,
+    start_by_longest=True,
+    datasets=DATASETS_CONFIG_LIST,
+    cudnn_benchmark=False,
+    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
+    mixed_precision=False,
+    test_sentences=[
+        #GUSTAVO: apenas pessoas do treino
+        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
+        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
+        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
+        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
+        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
+        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
+        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
+        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
+        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
+        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
+        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
+        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
+        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
+        # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
+        # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
+    ],
+    # Enable the weighted sampler
+    use_weighted_sampler=True,
+    # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
+    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
+    weighted_sampler_attrs={"language": 1.0},
+    weighted_sampler_multipliers={
+        # "speaker_name": {
+        # you can force the batching scheme to give a higher weight to a certain speaker and then this speaker will appears more frequently on the batch.
+        # It will speedup the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
+        # The line above will make the balancer consider the "new_speaker" as 106 speakers so 1/4 of the number of speakers present on CML dataset.
+        # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
+        # }
+    },
+    # It defines the Speaker Consistency Loss (SCL) α to 9 like the YourTTS paper
+    speaker_encoder_loss_alpha=9.0,
+)
+# Load all the datasets samples and split traning and evaluation sets
+train_samples, eval_samples = load_tts_samples(
+    config.datasets,
+    eval_split=True,
+    eval_split_max_size=config.eval_split_max_size,
+    eval_split_size=config.eval_split_size,
+)
+# Init the model
+model = Vits.init_from_config(config)
+# Init the trainer and 🚀
+trainer = Trainer(
+    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
+    config,
+    output_path=OUT_PATH,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+)
+trainer.fit()

Experiments/runs/YourTTS-Baseline-PT-January-27-2024_12+05PM-165973116/trainer_0_log.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5ddf81cb4061c7e47bd824c3ebb109cc02bc31ab79ee21e4e69d60d32aca454b
+size 1794644

Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2cb1d72efa1724f811028b33a003492d486385a35846b2a09aae34ece757cbab
+size 1044057134

Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/best_model_78415.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2cb1d72efa1724f811028b33a003492d486385a35846b2a09aae34ece757cbab
+size 1044057134

Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_80000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5208e907be9e2db12b928d9c2b1abd4df0b757f34703f124db1a326449a882f2
+size 1044057198

Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f77b5c81d12b629d83cce93a9b0318eb1d41888e6e985706fa275841c92444d3
+size 1044057198

Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/config.json ADDED Viewed

	@@ -0,0 +1,496 @@

+{
+    "output_path": "/raid/datasets/MUPE/Experiments/runs",
+    "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
+    "run_name": "YourTTS-Syntacc-PT",
+    "project_name": "SYNTACC",
+    "run_description": "\n            - YourTTS with SYNTACC text encoder\n        ",
+    "print_step": 50,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "clearml",
+    "save_on_interrupt": true,
+    "log_model_step": 1000,
+    "save_step": 5000,
+    "save_n_checkpoints": 2,
+    "save_checkpoints": true,
+    "save_all_best": false,
+    "save_best_after": 10000,
+    "target_loss": null,
+    "print_eval": false,
+    "test_delay_epochs": 0,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": false,
+    "precision": "fp16",
+    "epochs": 1000,
+    "batch_size": 26,
+    "eval_batch_size": 26,
+    "grad_clip": [
+        1000,
+        1000
+    ],
+    "scheduler_after_epoch": true,
+    "lr": 0.001,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": null,
+    "lr_scheduler_params": {},
+    "use_grad_scaler": false,
+    "allow_tf32": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "vits",
+    "num_loader_workers": 8,
+    "num_eval_loader_workers": 0,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "sample_rate": 16000,
+        "win_length": 1024,
+        "hop_length": 256,
+        "num_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null
+    },
+    "use_phonemes": false,
+    "phonemizer": "espeak",
+    "phoneme_language": "en",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": null,
+    "characters": {
+        "characters_class": "TTS.tts.models.vits.VitsCharacters",
+        "vocab_dict": null,
+        "pad": "_",
+        "eos": "&",
+        "bos": "*",
+        "blank": null,
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        "punctuations": "\u2014!'(),-.:;?\u00bf ",
+        "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        "is_unique": true,
+        "is_sorted": true
+    },
+    "add_blank": true,
+    "batch_group_size": 48,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": Infinity,
+    "compute_f0": false,
+    "compute_energy": false,
+    "compute_linear_spec": true,
+    "precompute_num_workers": 12,
+    "start_by_longest": true,
+    "shuffle": false,
+    "drop_last": false,
+    "datasets": [
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpb.csv",
+            "ignored_speakers": null,
+            "language": "brpb",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brba.csv",
+            "ignored_speakers": null,
+            "language": "brba",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brportugal.csv",
+            "ignored_speakers": null,
+            "language": "brportugal",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brsp.csv",
+            "ignored_speakers": null,
+            "language": "brsp",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpe.csv",
+            "ignored_speakers": null,
+            "language": "brpe",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brmg.csv",
+            "ignored_speakers": null,
+            "language": "brmg",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrj.csv",
+            "ignored_speakers": null,
+            "language": "brrj",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brce.csv",
+            "ignored_speakers": null,
+            "language": "brce",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrs.csv",
+            "ignored_speakers": null,
+            "language": "brrs",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bralemanha.csv",
+            "ignored_speakers": null,
+            "language": "bralemanha",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brgo.csv",
+            "ignored_speakers": null,
+            "language": "brgo",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bral.csv",
+            "ignored_speakers": null,
+            "language": "bral",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpr.csv",
+            "ignored_speakers": null,
+            "language": "brpr",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [
+        [
+            "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
+            "EDILEINE_FONSECA",
+            null,
+            "brsp"
+        ],
+        [
+            "Quem semeia ventos, colhe tempestades.",
+            "JOSE_PAULO_DE_ARAUJO",
+            null,
+            "brpb"
+        ],
+        [
+            "O olho do dono \u00e9 que engorda o gado.",
+            "VITOR_RAFAEL_OLIVEIRA_ALVES",
+            null,
+            "brba"
+        ],
+        [
+            "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
+            "MARIA_AURORA_FELIX",
+            null,
+            "brportugal"
+        ],
+        [
+            "Quem espera sempre alcan\u00e7a.",
+            "ANTONIO_DE_AMORIM_COSTA",
+            null,
+            "brpe"
+        ],
+        [
+            "Cada macaco no seu galho.",
+            "ALCIDES_DE_LIMA",
+            null,
+            "brmg"
+        ],
+        [
+            "Em terra de cego, quem tem um olho \u00e9 rei.",
+            "ALUISIO_SOARES_DE_SOUSA",
+            null,
+            "brrj"
+        ],
+        [
+            "A ocasi\u00e3o faz o ladr\u00e3o.",
+            "FRANCISCO_JOSE_MOREIRA_MOTA",
+            null,
+            "brce"
+        ],
+        [
+            "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
+            "EVALDO_ANDRADA_CORREA",
+            null,
+            "brrs"
+        ],
+        [
+            "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
+            "DORIS_ALEXANDER",
+            null,
+            "bralemanha"
+        ],
+        [
+            "Quem n\u00e3o arrisca, n\u00e3o petisca.",
+            "DONALDO_LUIZ_DE_ALMEIDA",
+            null,
+            "brgo"
+        ],
+        [
+            "A uni\u00e3o faz a for\u00e7a.",
+            "GERONCIO_HENRIQUE_NETO",
+            null,
+            "bral"
+        ],
+        [
+            "Em boca fechada n\u00e3o entra mosquito.",
+            "MALU_NATEL_FREIRE_WEBER",
+            null,
+            "brpr"
+        ]
+    ],
+    "eval_split_max_size": 256,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "model_args": {
+        "num_chars": 266,
+        "out_channels": 513,
+        "spec_segment_size": 62,
+        "hidden_channels": 192,
+        "use_adaptive_weight_text_encoder": true,
+        "use_perfect_class_batch_sampler": true,
+        "perfect_class_batch_sampler_key": "language",
+        "hidden_channels_ffn_text_encoder": 768,
+        "num_heads_text_encoder": 2,
+        "num_layers_text_encoder": 10,
+        "kernel_size_text_encoder": 3,
+        "dropout_p_text_encoder": 0.1,
+        "dropout_p_duration_predictor": 0.5,
+        "kernel_size_posterior_encoder": 5,
+        "dilation_rate_posterior_encoder": 1,
+        "num_layers_posterior_encoder": 16,
+        "kernel_size_flow": 5,
+        "dilation_rate_flow": 1,
+        "num_layers_flow": 4,
+        "resblock_type_decoder": "2",
+        "resblock_kernel_sizes_decoder": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes_decoder": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates_decoder": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel_decoder": 512,
+        "upsample_kernel_sizes_decoder": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "periods_multi_period_discriminator": [
+            2,
+            3,
+            5,
+            7,
+            11
+        ],
+        "use_sdp": true,
+        "noise_scale": 1.0,
+        "inference_noise_scale": 0.667,
+        "length_scale": 1,
+        "noise_scale_dp": 1.0,
+        "inference_noise_scale_dp": 1.0,
+        "max_inference_len": null,
+        "init_discriminator": true,
+        "use_spectral_norm_disriminator": false,
+        "use_speaker_embedding": false,
+        "num_speakers": 0,
+        "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/speakers.pth",
+        "d_vector_file": [
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
+        ],
+        "speaker_embedding_channels": 256,
+        "use_d_vector_file": true,
+        "d_vector_dim": 512,
+        "detach_dp_input": true,
+        "use_language_embedding": false,
+        "embedded_language_dim": 4,
+        "num_languages": 0,
+        "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/language_ids.json",
+        "use_speaker_encoder_as_loss": false,
+        "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+        "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+        "condition_dp_on_speaker": true,
+        "freeze_encoder": false,
+        "freeze_DP": false,
+        "freeze_PE": false,
+        "freeze_flow_decoder": false,
+        "freeze_waveform_decoder": false,
+        "encoder_sample_rate": null,
+        "interpolate_z": true,
+        "reinit_DP": false,
+        "reinit_text_encoder": false
+    },
+    "lr_gen": 0.0002,
+    "lr_disc": 0.0002,
+    "lr_scheduler_gen": "ExponentialLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "lr_scheduler_disc": "ExponentialLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "kl_loss_alpha": 1.0,
+    "disc_loss_alpha": 1.0,
+    "gen_loss_alpha": 1.0,
+    "feat_loss_alpha": 1.0,
+    "mel_loss_alpha": 45.0,
+    "dur_loss_alpha": 1.0,
+    "speaker_encoder_loss_alpha": 9.0,
+    "return_wav": true,
+    "use_weighted_sampler": true,
+    "weighted_sampler_attrs": {
+        "language": 1.0
+    },
+    "weighted_sampler_multipliers": {},
+    "r": 1,
+    "num_speakers": 0,
+    "use_speaker_embedding": false,
+    "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/speakers.pth",
+    "speaker_embedding_channels": 256,
+    "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/language_ids.json",
+    "use_language_embedding": false,
+    "use_d_vector_file": true,
+    "d_vector_file": [
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
+    ],
+    "d_vector_dim": 512
+}

Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/language_ids.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "bral": 0,
+    "bralemanha": 1,
+    "brba": 2,
+    "brce": 3,
+    "brgo": 4,
+    "brmg": 5,
+    "brpb": 6,
+    "brpe": 7,
+    "brportugal": 8,
+    "brpr": 9,
+    "brrj": 10,
+    "brrs": 11,
+    "brsp": 12
+}

Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/speakers.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
+size 3296

Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/train_syntacc.py ADDED Viewed

	@@ -0,0 +1,352 @@

+import os
+import torch
+from trainer import Trainer, TrainerArgs
+from TTS.bin.compute_embeddings import compute_embeddings
+from TTS.bin.resample import resample_files
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.configs.vits_config import VitsConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
+from TTS.utils.downloaders import download_libri_tts
+from torch.utils.data import DataLoader
+from TTS.utils.samplers import PerfectBatchSampler
+torch.set_num_threads(24)
+# pylint: disable=W0105
+"""
+    This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
+    YourTTS model is based on the VITS model however it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
+"""
+CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
+# Name of the run for the Trainer
+RUN_NAME = "YourTTS-Syntacc-PT"
+# Path where the model outputs are saved (configs, checkpoints and tensorboard logs): a "runs" folder next to this script
+OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs")  # "/raid/coqui/Checkpoints/original-YourTTS/"
+# If you want to do transfer learning and speed up your training, set here the path to the available CML-TTS checkpoint, which can be downloaded here:  https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+RESTORE_PATH = "/raid/edresson/dev/Paper/cml_tts/checkpoints_yourtts_cml_tts_dataset/best_model.pth"  # Download the checkpoint here:  https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+# This parameter is useful for debugging: it skips the training epochs and only runs the evaluation and produces the test sentences
+SKIP_TRAIN_EPOCH = False
+# Set here the batch size to be used in training and evaluation
+BATCH_SIZE = 26
+# Training sampling rate and the target sampling rate for resampling the downloaded dataset (Note: If you change this you might need to redownload the dataset !!)
+# Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter match, otherwise resample your audios
+SAMPLE_RATE = 16000
+DASHBOARD_LOGGER="tensorboard"
+LOGGER_URI = None
+DASHBOARD_LOGGER = "clearml"  # overrides the "tensorboard" value assigned two lines above
+LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"  # overrides the None assigned two lines above
+# Max audio length in seconds to be used in training (every audio longer than it will be ignored); float("inf") means no audio exceeds it
+MAX_AUDIO_LEN_IN_SECONDS = float("inf")
+# Dataset configs: one BaseDatasetConfig per metadata file (metadata_coqui_<language>.csv), all sharing the same dataset path, formatter and dataset name
+brpb_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpb.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpb"
+)
+brba_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brba.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brba"
+)
+brportugal_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brportugal.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brportugal"
+)
+brsp_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brsp.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brsp"
+)
+brpe_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpe.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpe"
+)
+brmg_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brmg.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brmg"
+)
+brrj_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrj.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrj"
+)
+brce_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brce.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brce"
+)
+brrs_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrs.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrs"
+)
+bralemanha_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bralemanha.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bralemanha"
+)
+brgo_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brgo.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brgo"
+)
+bral_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bral.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bral"
+)
+brpr_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpr.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpr"
+)
+bres_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bres.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bres"
+)
+brpi_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpi.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpi"
+)
+# bres_train_config and brpi_train_config are defined above but left out of the list below: no files were found for them
+DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
+### Extract speaker embeddings
+SPEAKER_ENCODER_CHECKPOINT_PATH = (
+    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
+)
+SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
+D_VECTOR_FILES = []  # List of speaker embedding/d-vector files to be used during training (one per dataset config)
+# Iterate over all the dataset configs, checking whether the speaker embeddings are already computed; if not, compute them
+for dataset_conf in DATASETS_CONFIG_LIST:
+    # The embeddings file lives in the dataset folder and is named after the config's language; it is only computed when missing
+    embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
+    if not os.path.isfile(embeddings_file):
+        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
+        compute_embeddings(
+            SPEAKER_ENCODER_CHECKPOINT_PATH,
+            SPEAKER_ENCODER_CONFIG_PATH,
+            embeddings_file,
+            old_speakers_file=None,
+            config_dataset_path=None,
+            formatter_name=dataset_conf.formatter,
+            dataset_name=dataset_conf.dataset_name,
+            dataset_path=dataset_conf.path,
+            meta_file_train=dataset_conf.meta_file_train,
+            meta_file_val=dataset_conf.meta_file_val,
+            disable_cuda=False,
+            no_eval=False,
+        )
+    D_VECTOR_FILES.append(embeddings_file)
+# Audio config used in training.
+audio_config = VitsAudioConfig(
+    sample_rate=SAMPLE_RATE,
+    hop_length=256,
+    win_length=1024,
+    fft_size=1024,
+    mel_fmin=0.0,
+    mel_fmax=None,
+    num_mels=80,
+)
+# Init VitsArgs, setting the arguments that are needed for the YourTTS model
+model_args = VitsArgs(
+    spec_segment_size=62,
+    hidden_channels=192,
+    hidden_channels_ffn_text_encoder=768,
+    num_heads_text_encoder=2,
+    num_layers_text_encoder=10,
+    kernel_size_text_encoder=3,
+    dropout_p_text_encoder=0.1,
+    d_vector_file=D_VECTOR_FILES,
+    use_d_vector_file=True,
+    d_vector_dim=512,
+    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
+    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
+    resblock_type_decoder="2",  # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
+    # Parameter that enables the Speaker Consistency Loss (SCL) described in the paper (set to False here)
+    use_speaker_encoder_as_loss=False,
+    # Parameters for multilingual training (the language embedding is switched off here)
+    use_language_embedding=False,
+    embedded_language_dim=4,
+    use_adaptive_weight_text_encoder=True,
+    use_perfect_class_batch_sampler=True,
+    perfect_class_batch_sampler_key="language"
+)
+# General training config, here you can change the batch size and others useful parameters
+config = VitsConfig(
+    output_path=OUT_PATH,
+    model_args=model_args,
+    run_name=RUN_NAME,
+    project_name="SYNTACC",
+    run_description="""
+            - YourTTS with SYNTACC text encoder
+        """,
+    dashboard_logger=DASHBOARD_LOGGER,
+    logger_uri=LOGGER_URI,
+    audio=audio_config,
+    batch_size=BATCH_SIZE,
+    batch_group_size=48,
+    eval_batch_size=BATCH_SIZE,
+    num_loader_workers=8,
+    eval_split_max_size=256,
+    print_step=50,
+    plot_step=100,
+    log_model_step=1000,
+    save_step=5000,
+    save_n_checkpoints=2,
+    save_checkpoints=True,
+    # target_loss="loss_1",
+    print_eval=False,
+    use_phonemes=False,
+    phonemizer="espeak",
+    phoneme_language="en",
+    compute_input_seq_cache=True,
+    add_blank=True,
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class="TTS.tts.models.vits.VitsCharacters",
+        pad="_",
+        eos="&",
+        bos="*",
+        blank=None,
+        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        punctuations="\u2014!'(),-.:;?\u00bf ",
+        phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        is_unique=True,
+        is_sorted=True,
+    ),
+    phoneme_cache_path=None,
+    precompute_num_workers=12,
+    start_by_longest=True,
+    datasets=DATASETS_CONFIG_LIST,
+    cudnn_benchmark=False,
+    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
+    mixed_precision=False,
+    test_sentences=[
+        #GUSTAVO: apenas pessoas do treino
+        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
+        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
+        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
+        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
+        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
+        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
+        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
+        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
+        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
+        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
+        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
+        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
+        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
+        # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
+        # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
+    ],
+    # Enable the weighted sampler
+    use_weighted_sampler=True,
+    # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
+    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
+    weighted_sampler_attrs={"language": 1.0},
+    weighted_sampler_multipliers={
+        # "speaker_name": {
+        # you can force the batching scheme to give a higher weight to a certain speaker and then this speaker will appears more frequently on the batch.
+        # It will speedup the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
+        # The line above will make the balancer consider the "new_speaker" as 106 speakers so 1/4 of the number of speakers present on CML dataset.
+        # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
+        # }
+    },
+    # It defines the Speaker Consistency Loss (SCL) α to 9 like the YourTTS paper
+    speaker_encoder_loss_alpha=9.0,
+)
+# Load all the dataset samples and split them into training and evaluation sets
+train_samples, eval_samples = load_tts_samples(
+    config.datasets,
+    eval_split=True,
+    eval_split_max_size=config.eval_split_max_size,
+    eval_split_size=config.eval_split_size,
+)
+# Init the model
+model = Vits.init_from_config(config)
+# Init the trainer (restoring from RESTORE_PATH and starting with an evaluation pass) and 🚀
+trainer = Trainer(
+    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
+    config,
+    output_path=OUT_PATH,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+)
+trainer.fit()

Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/trainer_0_log.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:92bf84450b9ef1865a5f553a00c5d3649069dd6b17b314e548a429a52a8a9f3f
+size 1423682

Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cdcbc348b4c18b558e8d8b5409027adf5897da1fce86b72795aaaf3635d3cb90
+size 1044057262

Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/best_model_87818.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cdcbc348b4c18b558e8d8b5409027adf5897da1fce86b72795aaaf3635d3cb90
+size 1044057262

Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/config.json ADDED Viewed

	@@ -0,0 +1,496 @@

+{
+    "output_path": "/raid/datasets/MUPE/Experiments/runs",
+    "logger_uri": "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/",
+    "run_name": "YourTTS-Syntacc-PT",
+    "project_name": "SYNTACC",
+    "run_description": "\n            - YourTTS with SYNTACC text encoder\n        ",
+    "print_step": 50,
+    "plot_step": 100,
+    "model_param_stats": false,
+    "wandb_entity": null,
+    "dashboard_logger": "clearml",
+    "save_on_interrupt": true,
+    "log_model_step": 1000,
+    "save_step": 5000,
+    "save_n_checkpoints": 2,
+    "save_checkpoints": true,
+    "save_all_best": false,
+    "save_best_after": 10000,
+    "target_loss": null,
+    "print_eval": false,
+    "test_delay_epochs": 0,
+    "run_eval": true,
+    "run_eval_steps": null,
+    "distributed_backend": "nccl",
+    "distributed_url": "tcp://localhost:54321",
+    "mixed_precision": false,
+    "precision": "fp16",
+    "epochs": 1000,
+    "batch_size": 26,
+    "eval_batch_size": 26,
+    "grad_clip": [
+        1000,
+        1000
+    ],
+    "scheduler_after_epoch": true,
+    "lr": 0.001,
+    "optimizer": "AdamW",
+    "optimizer_params": {
+        "betas": [
+            0.8,
+            0.99
+        ],
+        "eps": 1e-09,
+        "weight_decay": 0.01
+    },
+    "lr_scheduler": null,
+    "lr_scheduler_params": {},
+    "use_grad_scaler": false,
+    "allow_tf32": false,
+    "cudnn_enable": true,
+    "cudnn_deterministic": false,
+    "cudnn_benchmark": false,
+    "training_seed": 54321,
+    "model": "vits",
+    "num_loader_workers": 8,
+    "num_eval_loader_workers": 0,
+    "use_noise_augment": false,
+    "audio": {
+        "fft_size": 1024,
+        "sample_rate": 16000,
+        "win_length": 1024,
+        "hop_length": 256,
+        "num_mels": 80,
+        "mel_fmin": 0.0,
+        "mel_fmax": null
+    },
+    "use_phonemes": false,
+    "phonemizer": "espeak",
+    "phoneme_language": "en",
+    "compute_input_seq_cache": true,
+    "text_cleaner": "multilingual_cleaners",
+    "enable_eos_bos_chars": false,
+    "test_sentences_file": "",
+    "phoneme_cache_path": null,
+    "characters": {
+        "characters_class": "TTS.tts.models.vits.VitsCharacters",
+        "vocab_dict": null,
+        "pad": "_",
+        "eos": "&",
+        "bos": "*",
+        "blank": null,
+        "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        "punctuations": "\u2014!'(),-.:;?\u00bf ",
+        "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        "is_unique": true,
+        "is_sorted": true
+    },
+    "add_blank": true,
+    "batch_group_size": 48,
+    "loss_masking": null,
+    "min_audio_len": 1,
+    "max_audio_len": Infinity,
+    "min_text_len": 1,
+    "max_text_len": Infinity,
+    "compute_f0": false,
+    "compute_energy": false,
+    "compute_linear_spec": true,
+    "precompute_num_workers": 12,
+    "start_by_longest": true,
+    "shuffle": false,
+    "drop_last": false,
+    "datasets": [
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpb.csv",
+            "ignored_speakers": null,
+            "language": "brpb",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brba.csv",
+            "ignored_speakers": null,
+            "language": "brba",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brportugal.csv",
+            "ignored_speakers": null,
+            "language": "brportugal",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brsp.csv",
+            "ignored_speakers": null,
+            "language": "brsp",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpe.csv",
+            "ignored_speakers": null,
+            "language": "brpe",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brmg.csv",
+            "ignored_speakers": null,
+            "language": "brmg",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrj.csv",
+            "ignored_speakers": null,
+            "language": "brrj",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brce.csv",
+            "ignored_speakers": null,
+            "language": "brce",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brrs.csv",
+            "ignored_speakers": null,
+            "language": "brrs",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bralemanha.csv",
+            "ignored_speakers": null,
+            "language": "bralemanha",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brgo.csv",
+            "ignored_speakers": null,
+            "language": "brgo",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_bral.csv",
+            "ignored_speakers": null,
+            "language": "bral",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        },
+        {
+            "formatter": "coqui",
+            "dataset_name": "mupe",
+            "path": "/raid/datasets/MUPE/dataset/mupe/",
+            "meta_file_train": "metadata_coqui_brpr.csv",
+            "ignored_speakers": null,
+            "language": "brpr",
+            "phonemizer": "",
+            "meta_file_val": "",
+            "meta_file_attn_mask": ""
+        }
+    ],
+    "test_sentences": [
+        [
+            "Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.",
+            "EDILEINE_FONSECA",
+            null,
+            "brsp"
+        ],
+        [
+            "Quem semeia ventos, colhe tempestades.",
+            "JOSE_PAULO_DE_ARAUJO",
+            null,
+            "brpb"
+        ],
+        [
+            "O olho do dono \u00e9 que engorda o gado.",
+            "VITOR_RAFAEL_OLIVEIRA_ALVES",
+            null,
+            "brba"
+        ],
+        [
+            "\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.",
+            "MARIA_AURORA_FELIX",
+            null,
+            "brportugal"
+        ],
+        [
+            "Quem espera sempre alcan\u00e7a.",
+            "ANTONIO_DE_AMORIM_COSTA",
+            null,
+            "brpe"
+        ],
+        [
+            "Cada macaco no seu galho.",
+            "ALCIDES_DE_LIMA",
+            null,
+            "brmg"
+        ],
+        [
+            "Em terra de cego, quem tem um olho \u00e9 rei.",
+            "ALUISIO_SOARES_DE_SOUSA",
+            null,
+            "brrj"
+        ],
+        [
+            "A ocasi\u00e3o faz o ladr\u00e3o.",
+            "FRANCISCO_JOSE_MOREIRA_MOTA",
+            null,
+            "brce"
+        ],
+        [
+            "De gr\u00e3o em gr\u00e3o, a galinha enche o papo.",
+            "EVALDO_ANDRADA_CORREA",
+            null,
+            "brrs"
+        ],
+        [
+            "Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.",
+            "DORIS_ALEXANDER",
+            null,
+            "bralemanha"
+        ],
+        [
+            "Quem n\u00e3o arrisca, n\u00e3o petisca.",
+            "DONALDO_LUIZ_DE_ALMEIDA",
+            null,
+            "brgo"
+        ],
+        [
+            "A uni\u00e3o faz a for\u00e7a.",
+            "GERONCIO_HENRIQUE_NETO",
+            null,
+            "bral"
+        ],
+        [
+            "Em boca fechada n\u00e3o entra mosquito.",
+            "MALU_NATEL_FREIRE_WEBER",
+            null,
+            "brpr"
+        ]
+    ],
+    "eval_split_max_size": 256,
+    "eval_split_size": 0.01,
+    "use_speaker_weighted_sampler": false,
+    "speaker_weighted_sampler_alpha": 1.0,
+    "use_language_weighted_sampler": false,
+    "language_weighted_sampler_alpha": 1.0,
+    "use_length_weighted_sampler": false,
+    "length_weighted_sampler_alpha": 1.0,
+    "model_args": {
+        "num_chars": 266,
+        "out_channels": 513,
+        "spec_segment_size": 62,
+        "hidden_channels": 192,
+        "use_adaptive_weight_text_encoder": true,
+        "use_perfect_class_batch_sampler": true,
+        "perfect_class_batch_sampler_key": "language",
+        "hidden_channels_ffn_text_encoder": 768,
+        "num_heads_text_encoder": 2,
+        "num_layers_text_encoder": 10,
+        "kernel_size_text_encoder": 3,
+        "dropout_p_text_encoder": 0.1,
+        "dropout_p_duration_predictor": 0.5,
+        "kernel_size_posterior_encoder": 5,
+        "dilation_rate_posterior_encoder": 1,
+        "num_layers_posterior_encoder": 16,
+        "kernel_size_flow": 5,
+        "dilation_rate_flow": 1,
+        "num_layers_flow": 4,
+        "resblock_type_decoder": "2",
+        "resblock_kernel_sizes_decoder": [
+            3,
+            7,
+            11
+        ],
+        "resblock_dilation_sizes_decoder": [
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ],
+            [
+                1,
+                3,
+                5
+            ]
+        ],
+        "upsample_rates_decoder": [
+            8,
+            8,
+            2,
+            2
+        ],
+        "upsample_initial_channel_decoder": 512,
+        "upsample_kernel_sizes_decoder": [
+            16,
+            16,
+            4,
+            4
+        ],
+        "periods_multi_period_discriminator": [
+            2,
+            3,
+            5,
+            7,
+            11
+        ],
+        "use_sdp": true,
+        "noise_scale": 1.0,
+        "inference_noise_scale": 0.667,
+        "length_scale": 1,
+        "noise_scale_dp": 1.0,
+        "inference_noise_scale_dp": 1.0,
+        "max_inference_len": null,
+        "init_discriminator": true,
+        "use_spectral_norm_disriminator": false,
+        "use_speaker_embedding": false,
+        "num_speakers": 0,
+        "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/speakers.pth",
+        "d_vector_file": [
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
+            "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
+        ],
+        "speaker_embedding_channels": 256,
+        "use_d_vector_file": true,
+        "d_vector_dim": 512,
+        "detach_dp_input": true,
+        "use_language_embedding": false,
+        "embedded_language_dim": 4,
+        "num_languages": 0,
+        "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/language_ids.json",
+        "use_speaker_encoder_as_loss": false,
+        "speaker_encoder_config_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
+        "speaker_encoder_model_path": "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
+        "condition_dp_on_speaker": true,
+        "freeze_encoder": false,
+        "freeze_DP": false,
+        "freeze_PE": false,
+        "freeze_flow_decoder": false,
+        "freeze_waveform_decoder": false,
+        "encoder_sample_rate": null,
+        "interpolate_z": true,
+        "reinit_DP": false,
+        "reinit_text_encoder": false
+    },
+    "lr_gen": 0.0002,
+    "lr_disc": 0.0002,
+    "lr_scheduler_gen": "ExponentialLR",
+    "lr_scheduler_gen_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "lr_scheduler_disc": "ExponentialLR",
+    "lr_scheduler_disc_params": {
+        "gamma": 0.999875,
+        "last_epoch": -1
+    },
+    "kl_loss_alpha": 1.0,
+    "disc_loss_alpha": 1.0,
+    "gen_loss_alpha": 1.0,
+    "feat_loss_alpha": 1.0,
+    "mel_loss_alpha": 45.0,
+    "dur_loss_alpha": 1.0,
+    "speaker_encoder_loss_alpha": 9.0,
+    "return_wav": true,
+    "use_weighted_sampler": true,
+    "weighted_sampler_attrs": {
+        "language": 1.0
+    },
+    "weighted_sampler_multipliers": {},
+    "r": 1,
+    "num_speakers": 0,
+    "use_speaker_embedding": false,
+    "speakers_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/speakers.pth",
+    "speaker_embedding_channels": 256,
+    "language_ids_file": "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/language_ids.json",
+    "use_language_embedding": false,
+    "use_d_vector_file": true,
+    "d_vector_file": [
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpb.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brba.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brportugal.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brsp.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpe.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brmg.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrj.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brce.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brrs.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bralemanha.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brgo.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_bral.pth",
+        "/raid/datasets/MUPE/dataset/mupe/H_ASP_speaker_embeddings_brpr.pth"
+    ],
+    "d_vector_dim": 512
+}

Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/language_ids.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+    "bral": 0,
+    "bralemanha": 1,
+    "brba": 2,
+    "brce": 3,
+    "brgo": 4,
+    "brmg": 5,
+    "brpb": 6,
+    "brpe": 7,
+    "brportugal": 8,
+    "brpr": 9,
+    "brrj": 10,
+    "brrs": 11,
+    "brsp": 12
+}

Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/speakers.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d0b8d8013199105bfba41bbef0ac6c7fc44ecb3385a39980da80931496c039bf
+size 3296

Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/train_syntacc.py ADDED Viewed

	@@ -0,0 +1,352 @@

+import os
+import torch
+from trainer import Trainer, TrainerArgs
+from TTS.bin.compute_embeddings import compute_embeddings
+from TTS.bin.resample import resample_files
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.configs.vits_config import VitsConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
+from TTS.utils.downloaders import download_libri_tts
+from torch.utils.data import DataLoader
+from TTS.utils.samplers import PerfectBatchSampler
+torch.set_num_threads(24)
+# pylint: disable=W0105
+"""
+    This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
+    The YourTTS model is based on the VITS model; however, it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
+"""
+# Run-level constants: output/restore paths, batch size, sampling rate and logger selection.
+CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
+# Name of the run for the Trainer
+RUN_NAME = "YourTTS-Syntacc-PT"
+# Path where you want to save the model outputs (configs, checkpoints and tensorboard logs)
+OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs")  # "/raid/coqui/Checkpoints/original-YourTTS/"
+# If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here:  https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth"  # Currently a local checkpoint_85000.pth from an earlier run folder; the CML-TTS checkpoint is available here:  https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+# This parameter is useful for debugging: it skips the training epochs and just runs the evaluation and produces the test sentences
+SKIP_TRAIN_EPOCH = False
+# Set here the batch size to be used in training and evaluation
+BATCH_SIZE = 26
+# Training sampling rate and the target sampling rate for resampling the downloaded dataset (Note: if you change this you might need to redownload the dataset!)
+# Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter match; otherwise resample your audios
+SAMPLE_RATE = 16000
+# NOTE: the next two assignments are immediately overridden by the ClearML settings that follow them.
+DASHBOARD_LOGGER="tensorboard"
+LOGGER_URI = None
+DASHBOARD_LOGGER = "clearml"
+LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
+# Max audio length in seconds to be used in training (every audio longer than this will be ignored)
+# float("inf") means no finite upper limit is configured here.
+MAX_AUDIO_LEN_IN_SECONDS = float("inf")
+# Define here the datasets config
+brpb_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpb.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpb"
+)
+brba_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brba.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brba"
+)
+brportugal_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brportugal.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brportugal"
+)
+brsp_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brsp.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brsp"
+)
+brpe_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpe.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpe"
+)
+brmg_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brmg.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brmg"
+)
+brrj_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrj.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrj"
+)
+brce_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brce.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brce"
+)
+brrs_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrs.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrs"
+)
+bralemanha_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bralemanha.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bralemanha"
+)
+brgo_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brgo.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brgo"
+)
+bral_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bral.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bral"
+)
+brpr_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpr.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpr"
+)
+bres_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bres.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bres"
+)
+brpi_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpi.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpi"
+)
+# bres_train_config, brpi_train_config  no files found
+DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
+### Extract speaker embeddings
+SPEAKER_ENCODER_CHECKPOINT_PATH = (
+    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
+)
+SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
+D_VECTOR_FILES = []  # List of speaker embeddings/d-vectors to be used during the training
+# Iterates all the dataset configs checking if the speakers embeddings are already computated, if not compute it
+for dataset_conf in DATASETS_CONFIG_LIST:
+    # Check if the embeddings weren't already computed, if not compute it
+    embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
+    if not os.path.isfile(embeddings_file):
+        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
+        compute_embeddings(
+            SPEAKER_ENCODER_CHECKPOINT_PATH,
+            SPEAKER_ENCODER_CONFIG_PATH,
+            embeddings_file,
+            old_speakers_file=None,
+            config_dataset_path=None,
+            formatter_name=dataset_conf.formatter,
+            dataset_name=dataset_conf.dataset_name,
+            dataset_path=dataset_conf.path,
+            meta_file_train=dataset_conf.meta_file_train,
+            meta_file_val=dataset_conf.meta_file_val,
+            disable_cuda=False,
+            no_eval=False,
+        )
+    D_VECTOR_FILES.append(embeddings_file)
+# Audio config used in training. The sample rate comes from SAMPLE_RATE.
+audio_config = VitsAudioConfig(
+    sample_rate=SAMPLE_RATE,
+    hop_length=256,
+    win_length=1024,
+    fft_size=1024,
+    mel_fmin=0.0,
+    mel_fmax=None,
+    num_mels=80,
+)
+# Init VitsArgs, setting the arguments that are needed for the YourTTS model
+model_args = VitsArgs(
+    spec_segment_size=62,
+    hidden_channels=192,
+    hidden_channels_ffn_text_encoder=768,
+    num_heads_text_encoder=2,
+    num_layers_text_encoder=10,
+    kernel_size_text_encoder=3,
+    dropout_p_text_encoder=0.1,
+    # Speaker conditioning uses the d-vector files collected in D_VECTOR_FILES
+    d_vector_file=D_VECTOR_FILES,
+    use_d_vector_file=True,
+    d_vector_dim=512,
+    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
+    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
+    resblock_type_decoder="2",  # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
+    # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper (disabled here)
+    use_speaker_encoder_as_loss=False,
+    # Useful parameters to enable multilingual training (the language embedding is disabled here)
+    use_language_embedding=False,
+    embedded_language_dim=4,
+    # NOTE(review): the three options below presumably enable the SYNTACC text encoder and
+    # class-balanced batches keyed by the `language` field - confirm against the TTS version in use
+    use_adaptive_weight_text_encoder=True,
+    use_perfect_class_batch_sampler=True,
+    perfect_class_batch_sampler_key="language"
+)
+# General training config, here you can change the batch size and other useful parameters
+config = VitsConfig(
+    output_path=OUT_PATH,
+    model_args=model_args,
+    run_name=RUN_NAME,
+    project_name="SYNTACC",
+    run_description="""
+            - YourTTS with SYNTACC text encoder
+        """,
+    dashboard_logger=DASHBOARD_LOGGER,
+    logger_uri=LOGGER_URI,
+    audio=audio_config,
+    batch_size=BATCH_SIZE,
+    batch_group_size=48,
+    eval_batch_size=BATCH_SIZE,
+    num_loader_workers=8,
+    eval_split_max_size=256,
+    print_step=50,
+    plot_step=100,
+    log_model_step=1000,
+    save_step=5000,
+    save_n_checkpoints=2,
+    save_checkpoints=True,
+    # target_loss="loss_1",
+    print_eval=False,
+    # use_phonemes is False; NOTE(review): phonemizer and phoneme_language are then presumably unused - confirm
+    use_phonemes=False,
+    phonemizer="espeak",
+    phoneme_language="en",
+    compute_input_seq_cache=True,
+    add_blank=True,
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class="TTS.tts.models.vits.VitsCharacters",
+        pad="_",
+        eos="&",
+        bos="*",
+        blank=None,
+        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        punctuations="\u2014!'(),-.:;?\u00bf ",
+        phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        is_unique=True,
+        is_sorted=True,
+    ),
+    phoneme_cache_path=None,
+    precompute_num_workers=12,
+    start_by_longest=True,
+    datasets=DATASETS_CONFIG_LIST,
+    cudnn_benchmark=False,
+    # SAMPLE_RATE * float("inf") evaluates to inf, so no finite maximum audio length is set
+    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
+    mixed_precision=False,
+    # Each entry is [text, speaker name, None, language code]
+    test_sentences=[
+        # GUSTAVO: only people from the training set
+        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
+        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
+        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
+        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
+        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
+        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
+        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
+        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
+        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
+        # NOTE(review): the sentence below has an uppercase \u00c1 in the middle of a word - likely meant to be \u00e1; confirm
+        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
+        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
+        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
+        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
+        # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
+        # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
+    ],
+    # Enable the weighted sampler
+    use_weighted_sampler=True,
+    # The sampler is weighted on the `language` attribute only; the variant that also balances speakers is kept commented out below
+    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
+    weighted_sampler_attrs={"language": 1.0},
+    weighted_sampler_multipliers={
+        # "speaker_name": {
+        # you can force the batching scheme to give a higher weight to a certain speaker, and then this speaker will appear more frequently in the batch.
+        # It will speed up the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
+        # The line below will make the balancer consider the "new_speaker" as 106 speakers, so 1/4 of the number of speakers present in the CML dataset.
+        # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
+        # }
+    },
+    # It defines the Speaker Consistency Loss (SCL) α to 9 like the YourTTS paper
+    # NOTE(review): use_speaker_encoder_as_loss is False in model_args, so this weight presumably has no effect - confirm
+    speaker_encoder_loss_alpha=9.0,
+)
+# Load all the dataset samples and split them into training and evaluation sets,
+# using the split sizes stored in the config
+train_samples, eval_samples = load_tts_samples(
+    config.datasets,
+    eval_split=True,
+    eval_split_max_size=config.eval_split_max_size,
+    eval_split_size=config.eval_split_size,
+)
+# Init the model
+model = Vits.init_from_config(config)
+# Init the trainer and 🚀
+# The trainer restores from RESTORE_PATH; NOTE(review): start_with_eval=True presumably runs an evaluation before training starts - confirm
+trainer = Trainer(
+    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),
+    config,
+    output_path=OUT_PATH,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+)
+trainer.fit()

Experiments/runs/YourTTS-Syntacc-PT-January-28-2024_09+50AM-5f5841de1/trainer_0_log.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc570d138b2c5d578e697b92e2d2d060c0945fcd0f880761a3fc800eaf619b6a
+size 97918

Experiments/train_syntacc.py ADDED Viewed

	@@ -0,0 +1,352 @@

+import os
+import torch
+from trainer import Trainer, TrainerArgs
+from TTS.bin.compute_embeddings import compute_embeddings
+from TTS.bin.resample import resample_files
+from TTS.config.shared_configs import BaseDatasetConfig
+from TTS.tts.configs.vits_config import VitsConfig
+from TTS.tts.datasets import load_tts_samples
+from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig, VitsDataset
+from TTS.utils.downloaders import download_libri_tts
+from torch.utils.data import DataLoader
+from TTS.utils.samplers import PerfectBatchSampler
+torch.set_num_threads(24)
+# pylint: disable=W0105
+"""
+    This recipe replicates the first experiment proposed in the CML-TTS paper (https://arxiv.org/abs/2306.10097). It uses the YourTTS model.
+    The YourTTS model is based on the VITS model; however, it uses external speaker embeddings extracted from a pre-trained speaker encoder and has small architecture changes.
+"""
+# Run-level constants: output/restore paths, batch size, sampling rate and logger selection.
+CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
+# Name of the run for the Trainer
+RUN_NAME = "YourTTS-Syntacc-PT"
+# Path where you want to save the model outputs (configs, checkpoints and tensorboard logs)
+OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "runs")  # "/raid/coqui/Checkpoints/original-YourTTS/"
+# If you want to do transfer learning and speed up your training, you can set here the path to the available CML-TTS checkpoint, which can be downloaded here:  https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+RESTORE_PATH = "/raid/datasets/MUPE/Experiments/runs/YourTTS-Syntacc-PT-January-25-2024_02+59PM-0000000/checkpoint_85000.pth"  # Currently a local checkpoint_85000.pth from an earlier run folder; the CML-TTS checkpoint is available here:  https://drive.google.com/u/2/uc?id=1yDCSJ1pFZQTHhL09GMbOrdjcPULApa0p
+# This parameter is useful for debugging: it skips the training epochs and just runs the evaluation and produces the test sentences
+SKIP_TRAIN_EPOCH = False
+# Set here the batch size to be used in training and evaluation
+BATCH_SIZE = 26
+# Training sampling rate and the target sampling rate for resampling the downloaded dataset (Note: if you change this you might need to redownload the dataset!)
+# Note: If you add new datasets, please make sure that the dataset sampling rate and this parameter match; otherwise resample your audios
+SAMPLE_RATE = 16000
+# NOTE: the next two assignments are immediately overridden by the ClearML settings that follow them.
+DASHBOARD_LOGGER="tensorboard"
+LOGGER_URI = None
+DASHBOARD_LOGGER = "clearml"
+LOGGER_URI = "s3://coqui-ai-models/TTS/Checkpoints/YourTTS/MUPE/"
+# Max audio length in seconds to be used in training (every audio longer than this will be ignored)
+# float("inf") means no finite upper limit is configured here.
+MAX_AUDIO_LEN_IN_SECONDS = float("inf")
+# Define here the datasets config
+brpb_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpb.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpb"
+)
+brba_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brba.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brba"
+)
+brportugal_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brportugal.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brportugal"
+)
+brsp_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brsp.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brsp"
+)
+brpe_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpe.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpe"
+)
+brmg_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brmg.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brmg"
+)
+brrj_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrj.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrj"
+)
+brce_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brce.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brce"
+)
+brrs_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brrs.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brrs"
+)
+bralemanha_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bralemanha.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bralemanha"
+)
+brgo_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brgo.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brgo"
+)
+bral_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bral.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bral"
+)
+brpr_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpr.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpr"
+)
+bres_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_bres.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="bres"
+)
+brpi_train_config = BaseDatasetConfig(
+    formatter="coqui",
+    dataset_name="mupe",
+    meta_file_train="metadata_coqui_brpi.csv",
+    path="/raid/datasets/MUPE/dataset/mupe/",
+    language="brpi"
+)
+# bres_train_config, brpi_train_config  no files found
+DATASETS_CONFIG_LIST = [brpb_train_config,brba_train_config,brportugal_train_config,brsp_train_config,brpe_train_config,brmg_train_config,brrj_train_config,brce_train_config,brrs_train_config,bralemanha_train_config,brgo_train_config,bral_train_config,brpr_train_config]
+### Extract speaker embeddings
+SPEAKER_ENCODER_CHECKPOINT_PATH = (
+    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
+)
+SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"
+D_VECTOR_FILES = []  # List of speaker embeddings/d-vectors to be used during the training
+# Iterates all the dataset configs checking if the speakers embeddings are already computated, if not compute it
+for dataset_conf in DATASETS_CONFIG_LIST:
+    # Check if the embeddings weren't already computed, if not compute it
+    embeddings_file = os.path.join(dataset_conf.path, f"H_ASP_speaker_embeddings_{dataset_conf.language}.pth")
+    if not os.path.isfile(embeddings_file):
+        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
+        compute_embeddings(
+            SPEAKER_ENCODER_CHECKPOINT_PATH,
+            SPEAKER_ENCODER_CONFIG_PATH,
+            embeddings_file,
+            old_speakers_file=None,
+            config_dataset_path=None,
+            formatter_name=dataset_conf.formatter,
+            dataset_name=dataset_conf.dataset_name,
+            dataset_path=dataset_conf.path,
+            meta_file_train=dataset_conf.meta_file_train,
+            meta_file_val=dataset_conf.meta_file_val,
+            disable_cuda=False,
+            no_eval=False,
+        )
+    D_VECTOR_FILES.append(embeddings_file)
+# Audio config used in training. The sample rate comes from SAMPLE_RATE.
+audio_config = VitsAudioConfig(
+    sample_rate=SAMPLE_RATE,
+    hop_length=256,
+    win_length=1024,
+    fft_size=1024,
+    mel_fmin=0.0,
+    mel_fmax=None,
+    num_mels=80,
+)
+# Init VitsArgs, setting the arguments that are needed for the YourTTS model
+model_args = VitsArgs(
+    spec_segment_size=62,
+    hidden_channels=192,
+    hidden_channels_ffn_text_encoder=768,
+    num_heads_text_encoder=2,
+    num_layers_text_encoder=10,
+    kernel_size_text_encoder=3,
+    dropout_p_text_encoder=0.1,
+    # Speaker conditioning uses the d-vector files collected in D_VECTOR_FILES
+    d_vector_file=D_VECTOR_FILES,
+    use_d_vector_file=True,
+    d_vector_dim=512,
+    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
+    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
+    resblock_type_decoder="2",  # In the paper, we accidentally trained the YourTTS using ResNet blocks type 2, if you like you can use the ResNet blocks type 1 like the VITS model
+    # Useful parameters to enable the Speaker Consistency Loss (SCL) described in the paper (disabled here)
+    use_speaker_encoder_as_loss=False,
+    # Useful parameters to enable multilingual training (the language embedding is disabled here)
+    use_language_embedding=False,
+    embedded_language_dim=4,
+    # NOTE(review): the three options below presumably enable the SYNTACC text encoder and
+    # class-balanced batches keyed by the `language` field - confirm against the TTS version in use
+    use_adaptive_weight_text_encoder=True,
+    use_perfect_class_batch_sampler=True,
+    perfect_class_batch_sampler_key="language"
+)
+# General training config, here you can change the batch size and other useful parameters
+config = VitsConfig(
+    output_path=OUT_PATH,
+    model_args=model_args,
+    run_name=RUN_NAME,
+    project_name="SYNTACC",
+    run_description="""
+            - YourTTS with SYNTACC text encoder
+        """,
+    dashboard_logger=DASHBOARD_LOGGER,
+    logger_uri=LOGGER_URI,
+    audio=audio_config,
+    batch_size=BATCH_SIZE,
+    batch_group_size=48,
+    eval_batch_size=BATCH_SIZE,
+    num_loader_workers=8,
+    eval_split_max_size=256,
+    print_step=50,
+    plot_step=100,
+    log_model_step=1000,
+    save_step=5000,
+    save_n_checkpoints=2,
+    save_checkpoints=True,
+    # target_loss="loss_1",
+    print_eval=False,
+    use_phonemes=False,
+    phonemizer="espeak",
+    phoneme_language="en",
+    compute_input_seq_cache=True,
+    add_blank=True,
+    text_cleaner="multilingual_cleaners",
+    characters=CharactersConfig(
+        characters_class="TTS.tts.models.vits.VitsCharacters",
+        pad="_",
+        eos="&",
+        bos="*",
+        blank=None,
+        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00a1\u00a3\u00b7\u00b8\u00c0\u00c1\u00c2\u00c3\u00c4\u00c5\u00c7\u00c8\u00c9\u00ca\u00cb\u00cc\u00cd\u00ce\u00cf\u00d1\u00d2\u00d3\u00d4\u00d5\u00d6\u00d9\u00da\u00db\u00dc\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e5\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u0101\u0104\u0105\u0106\u0107\u010b\u0119\u0141\u0142\u0143\u0144\u0152\u0153\u015a\u015b\u0161\u0178\u0179\u017a\u017b\u017c\u020e\u04e7\u05c2\u1b20",
+        punctuations="\u2014!'(),-.:;?\u00bf ",
+        phonemes="iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b'\u0303' ",
+        is_unique=True,
+        is_sorted=True,
+    ),
+    phoneme_cache_path=None,
+    precompute_num_workers=12,
+    start_by_longest=True,
+    datasets=DATASETS_CONFIG_LIST,
+    cudnn_benchmark=False,
+    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
+    mixed_precision=False,
+    test_sentences=[
+        #GUSTAVO: apenas pessoas do treino
+        ["Voc\u00ea ter\u00e1 a vista do topo da montanha que voc\u00ea escalar.", "EDILEINE_FONSECA", None, "brsp"],
+        ["Quem semeia ventos, colhe tempestades.", "JOSE_PAULO_DE_ARAUJO", None, "brpb"],
+        ["O olho do dono \u00e9 que engorda o gado.", "VITOR_RAFAEL_OLIVEIRA_ALVES", None, "brba"],
+        ["\u00c1gua mole em pedra dura, tanto bate at\u00e9 que fura.", "MARIA_AURORA_FELIX", None, "brportugal"],
+        ["Quem espera sempre alcan\u00e7a.", "ANTONIO_DE_AMORIM_COSTA", None, "brpe"],
+        ["Cada macaco no seu galho.", "ALCIDES_DE_LIMA", None, "brmg"],
+        ["Em terra de cego, quem tem um olho \u00e9 rei.", "ALUISIO_SOARES_DE_SOUSA", None, "brrj"],
+        ["A ocasi\u00e3o faz o ladr\u00e3o.", "FRANCISCO_JOSE_MOREIRA_MOTA", None, "brce"],
+        ["De gr\u00e3o em gr\u00e3o, a galinha enche o papo.", "EVALDO_ANDRADA_CORREA", None, "brrs"],
+        ["Mais vale um p\u00c1ssaro na m\u00e3o do que dois voando.", "DORIS_ALEXANDER", None, "bralemanha"],
+        ["Quem n\u00e3o arrisca, n\u00e3o petisca.", "DONALDO_LUIZ_DE_ALMEIDA", None, "brgo"],
+        ["A uni\u00e3o faz a for\u00e7a.", "GERONCIO_HENRIQUE_NETO", None, "bral"],
+        ["Em boca fechada n\u00e3o entra mosquito.", "MALU_NATEL_FREIRE_WEBER", None, "brpr"],
+        # ["Quem n\u00e3o tem dinheiro, n\u00e3o tem v\u00edcios.", "INES_VIEIRA_BOGEA", None, "bres"],
+        # ["Quando voc\u00ea n\u00e3o corre nenhum risco, voc\u00ea arrisca tudo.", "MARIA_ASSUNCAO_SOUSA", None, "brpi"]
+    ],
+    # Enable the weighted sampler
+    use_weighted_sampler=True,
+    # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
+    # weighted_sampler_attrs={"language": 1.0, "speaker_name": 1.0},
+    weighted_sampler_attrs={"language": 1.0},
+    weighted_sampler_multipliers={
+        # "speaker_name": {
+        # you can force the batching scheme to give a higher weight to a certain speaker and then this speaker will appears more frequently on the batch.
+        # It will speedup the speaker adaptation process. Considering the CML train dataset and "new_speaker" as the speaker name of the speaker that you want to adapt.
+        # The line above will make the balancer consider the "new_speaker" as 106 speakers so 1/4 of the number of speakers present on CML dataset.
+        # 'new_speaker': 106, # (CML tot. train speaker)/4 = (424/4) = 106
+        # }
+    },
+    # It defines the Speaker Consistency Loss (SCL) α to 9 like the YourTTS paper
+    speaker_encoder_loss_alpha=9.0,
+)
+# Load the samples of every dataset in config.datasets and split them into training and evaluation sets
+train_samples, eval_samples = load_tts_samples(
+    config.datasets,
+    eval_split=True,
+    eval_split_max_size=config.eval_split_max_size,  # 256, as set on the VitsConfig in this script
+    eval_split_size=config.eval_split_size,  # not set explicitly in this script, so the config's default value is used
+)
+# Build the VITS model from the training config
+model = Vits.init_from_config(config)
+# Init the trainer with the model and both sample sets, then start training 🚀
+trainer = Trainer(
+    TrainerArgs(restore_path=RESTORE_PATH, skip_train_epoch=SKIP_TRAIN_EPOCH, start_with_eval=True),  # RESTORE_PATH / SKIP_TRAIN_EPOCH are defined outside this section
+    config,
+    output_path=OUT_PATH,
+    model=model,
+    train_samples=train_samples,
+    eval_samples=eval_samples,
+)
+trainer.fit()