NeonBohdan committed on
Commit
187dcb1
1 Parent(s): f8eac93

Updated configs

Browse files
Files changed (3) hide show
  1. config.json +136 -84
  2. language_ids.json +3 -0
  3. speaker_ids.json +4 -0
config.json CHANGED
@@ -1,28 +1,56 @@
1
  {
2
- "model": "vits",
 
3
  "run_name": "",
 
4
  "run_description": "",
5
- "epochs": 10000,
6
- "batch_size": null,
7
- "eval_batch_size": null,
8
- "mixed_precision": false,
9
- "scheduler_after_epoch": true,
10
- "run_eval": true,
11
- "test_delay_epochs": 0,
12
- "print_eval": false,
13
  "print_step": 25,
14
- "tb_plot_step": 100,
15
- "tb_model_param_stats": false,
 
 
 
16
  "save_step": 10000,
17
- "checkpoint": true,
18
- "keep_all_best": false,
19
- "keep_after": 10000,
20
- "num_loader_workers": null,
21
- "num_eval_loader_workers": 0,
22
- "use_noise_augment": false,
23
- "output_path": null,
 
24
  "distributed_backend": "nccl",
25
  "distributed_url": "tcp://localhost:54321",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  "audio": {
27
  "fft_size": 1024,
28
  "win_length": 1024,
@@ -35,17 +63,21 @@
35
  "preemphasis": 0.0,
36
  "ref_level_db": 20,
37
  "do_sound_norm": false,
38
- "log_func": "np.log10",
39
  "do_trim_silence": true,
40
  "trim_db": 45,
 
 
41
  "power": 1.5,
42
  "griffin_lim_iters": 60,
43
  "num_mels": 80,
44
  "mel_fmin": 0.0,
45
  "mel_fmax": null,
46
- "spec_gain": 20,
47
- "do_amp_to_db_linear": true,
48
  "do_amp_to_db_mel": true,
 
 
49
  "signal_norm": true,
50
  "min_level_db": -100,
51
  "symmetric_norm": true,
@@ -54,62 +86,65 @@
54
  "stats_path": null
55
  },
56
  "use_phonemes": true,
 
57
  "phoneme_language": "en",
58
- "compute_input_seq_cache": false,
59
  "text_cleaner": "phoneme_cleaners",
60
- "phonemizer": "espeak",
61
  "enable_eos_bos_chars": false,
62
  "test_sentences_file": "",
63
- "phoneme_cache_path": null,
64
- "characters":{
65
- "characters_class": "TTS.tts.models.vits.VitsCharacters",
66
- "pad": "_",
67
- "eos": "",
68
- "bos": "",
69
- "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
70
- "punctuations":";:,.!?¡¿—…\"«»“” ",
71
- "phonemes":"ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
 
 
 
 
72
  },
 
73
  "batch_group_size": 0,
74
  "loss_masking": null,
75
- "min_seq_len": 13,
76
- "max_seq_len": 200,
 
 
 
77
  "compute_f0": false,
78
  "compute_linear_spec": true,
79
- "add_blank": true,
 
80
  "datasets": [
81
  {
82
- "name": "",
83
- "path": "",
84
- "meta_file_train": "",
85
- "ununsed_speakers": null,
 
86
  "meta_file_val": "",
87
  "meta_file_attn_mask": ""
88
  }
89
  ],
90
- "optimizer": "AdamW",
91
- "optimizer_params": {
92
- "betas": [
93
- 0.8,
94
- 0.99
95
- ],
96
- "eps": 1e-09,
97
- "weight_decay": 0.01
98
- },
99
- "lr_scheduler": "",
100
- "lr_scheduler_params": {},
101
  "test_sentences": [
102
- "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
103
- "Be a voice, not an echo.",
104
- "I'm sorry Dave. I'm afraid I can't do that.",
105
- "This cake is great. It's so delicious and moist.",
106
- "Prior to November 22, 1963."
 
107
  ],
108
- "use_speaker_embedding": false,
109
- "use_d_vector_file": false,
110
- "d_vector_dim": 0,
 
 
 
111
  "model_args": {
112
- "num_chars": 179,
113
  "out_channels": 513,
114
  "spec_segment_size": 32,
115
  "hidden_channels": 192,
@@ -118,69 +153,75 @@
118
  "num_layers_text_encoder": 6,
119
  "kernel_size_text_encoder": 3,
120
  "dropout_p_text_encoder": 0.1,
 
121
  "kernel_size_posterior_encoder": 5,
122
  "dilation_rate_posterior_encoder": 1,
123
  "num_layers_posterior_encoder": 16,
124
  "kernel_size_flow": 5,
125
  "dilation_rate_flow": 1,
126
  "num_layers_flow": 4,
127
- "resblock_type_decoder": "1",
128
  "resblock_kernel_sizes_decoder": [
129
  3,
130
- 7,
131
- 11
132
  ],
133
  "resblock_dilation_sizes_decoder": [
134
  [
135
  1,
136
- 3,
137
- 5
138
  ],
139
  [
140
- 1,
141
- 3,
142
- 5
143
  ],
144
  [
145
- 1,
146
  3,
147
- 5
148
  ]
149
  ],
150
  "upsample_rates_decoder": [
151
  8,
152
  8,
153
- 2,
154
- 2
155
  ],
156
- "upsample_initial_channel_decoder": 512,
157
  "upsample_kernel_sizes_decoder": [
158
  16,
159
  16,
160
- 4,
161
- 4
162
  ],
163
  "use_sdp": true,
164
  "noise_scale": 1.0,
165
- "inference_noise_scale": 0.667,
166
  "length_scale": 1,
167
  "noise_scale_dp": 1.0,
168
- "inference_noise_scale_dp": 0.8,
169
  "max_inference_len": null,
170
  "init_discriminator": false,
171
  "use_spectral_norm_disriminator": false,
172
- "use_speaker_embedding": false,
173
  "num_speakers": 0,
174
- "speakers_file": null,
 
175
  "speaker_embedding_channels": 256,
176
  "use_d_vector_file": false,
177
  "d_vector_dim": 0,
178
- "detach_dp_input": true
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  },
180
- "grad_clip": [
181
- 5,
182
- 5
183
- ],
184
  "lr_gen": 0.0002,
185
  "lr_disc": 0.0002,
186
  "lr_scheduler_gen": "ExponentialLR",
@@ -198,6 +239,17 @@
198
  "gen_loss_alpha": 1.0,
199
  "feat_loss_alpha": 1.0,
200
  "mel_loss_alpha": 45.0,
 
 
201
  "return_wav": true,
202
- "r": 1
 
 
 
 
 
 
 
 
 
203
  }
 
1
  {
2
+ "output_path": null,
3
+ "logger_uri": null,
4
  "run_name": "",
5
+ "project_name": null,
6
  "run_description": "",
 
 
 
 
 
 
 
 
7
  "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "log_model_step": null,
13
  "save_step": 10000,
14
+ "save_n_checkpoints": 5,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": false,
20
+ "test_delay_epochs": -1,
21
+ "run_eval": true,
22
  "distributed_backend": "nccl",
23
  "distributed_url": "tcp://localhost:54321",
24
+ "mixed_precision": true,
25
+ "epochs": 1000,
26
+ "batch_size": 32,
27
+ "eval_batch_size": 4,
28
+ "grad_clip": [
29
+ 1000,
30
+ 1000
31
+ ],
32
+ "scheduler_after_epoch": true,
33
+ "lr": 0.001,
34
+ "optimizer": "AdamW",
35
+ "optimizer_params": {
36
+ "betas": [
37
+ 0.8,
38
+ 0.99
39
+ ],
40
+ "eps": 1e-09,
41
+ "weight_decay": 0.01
42
+ },
43
+ "lr_scheduler": "",
44
+ "lr_scheduler_params": {},
45
+ "use_grad_scaler": false,
46
+ "cudnn_enable": true,
47
+ "cudnn_deterministic": false,
48
+ "cudnn_benchmark": false,
49
+ "training_seed": 54321,
50
+ "model": "vits",
51
+ "num_loader_workers": 8,
52
+ "num_eval_loader_workers": 8,
53
+ "use_noise_augment": false,
54
  "audio": {
55
  "fft_size": 1024,
56
  "win_length": 1024,
 
63
  "preemphasis": 0.0,
64
  "ref_level_db": 20,
65
  "do_sound_norm": false,
66
+ "log_func": "np.log",
67
  "do_trim_silence": true,
68
  "trim_db": 45,
69
+ "do_rms_norm": false,
70
+ "db_level": null,
71
  "power": 1.5,
72
  "griffin_lim_iters": 60,
73
  "num_mels": 80,
74
  "mel_fmin": 0.0,
75
  "mel_fmax": null,
76
+ "spec_gain": 1.0,
77
+ "do_amp_to_db_linear": false,
78
  "do_amp_to_db_mel": true,
79
+ "pitch_fmax": 640.0,
80
+ "pitch_fmin": 0.0,
81
  "signal_norm": true,
82
  "min_level_db": -100,
83
  "symmetric_norm": true,
 
86
  "stats_path": null
87
  },
88
  "use_phonemes": true,
89
+ "phonemizer": "espeak",
90
  "phoneme_language": "en",
91
+ "compute_input_seq_cache": true,
92
  "text_cleaner": "phoneme_cleaners",
 
93
  "enable_eos_bos_chars": false,
94
  "test_sentences_file": "",
95
+ "phoneme_cache_path": "./logs/phoneme_cache",
96
+ "characters": {
97
+ "characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
98
+ "vocab_dict": null,
99
+ "pad": "<PAD>",
100
+ "eos": "<EOS>",
101
+ "bos": "<BOS>",
102
+ "blank": "<BLNK>",
103
+ "characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
104
+ "punctuations": "!'(),-.:;? ",
105
+ "phonemes": null,
106
+ "is_unique": false,
107
+ "is_sorted": true
108
  },
109
+ "add_blank": true,
110
  "batch_group_size": 0,
111
  "loss_masking": null,
112
+ "sort_by_audio_len": false,
113
+ "min_audio_len": 32768,
114
+ "max_audio_len": 132300,
115
+ "min_text_len": 1,
116
+ "max_text_len": Infinity,
117
  "compute_f0": false,
118
  "compute_linear_spec": true,
119
+ "precompute_num_workers": 12,
120
+ "start_by_longest": false,
121
  "datasets": [
122
  {
123
+ "name": "ljspeech",
124
+ "path": "./datasets/ljspeech/",
125
+ "meta_file_train": "metadata.csv",
126
+ "ignored_speakers": null,
127
+ "language": "en",
128
  "meta_file_val": "",
129
  "meta_file_attn_mask": ""
130
  }
131
  ],
 
 
 
 
 
 
 
 
 
 
 
132
  "test_sentences": [
133
+ [
134
+ "A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light.",
135
+ "ljspeech",
136
+ null,
137
+ "en"
138
+ ]
139
  ],
140
+ "eval_split_max_size": null,
141
+ "eval_split_size": 0.01,
142
+ "use_speaker_weighted_sampler": false,
143
+ "speaker_weighted_sampler_alpha": 1.0,
144
+ "use_language_weighted_sampler": true,
145
+ "language_weighted_sampler_alpha": 1.0,
146
  "model_args": {
147
+ "num_chars": 131,
148
  "out_channels": 513,
149
  "spec_segment_size": 32,
150
  "hidden_channels": 192,
 
153
  "num_layers_text_encoder": 6,
154
  "kernel_size_text_encoder": 3,
155
  "dropout_p_text_encoder": 0.1,
156
+ "dropout_p_duration_predictor": 0.5,
157
  "kernel_size_posterior_encoder": 5,
158
  "dilation_rate_posterior_encoder": 1,
159
  "num_layers_posterior_encoder": 16,
160
  "kernel_size_flow": 5,
161
  "dilation_rate_flow": 1,
162
  "num_layers_flow": 4,
163
+ "resblock_type_decoder": "2",
164
  "resblock_kernel_sizes_decoder": [
165
  3,
166
+ 5,
167
+ 7
168
  ],
169
  "resblock_dilation_sizes_decoder": [
170
  [
171
  1,
172
+ 2
 
173
  ],
174
  [
175
+ 2,
176
+ 6
 
177
  ],
178
  [
 
179
  3,
180
+ 12
181
  ]
182
  ],
183
  "upsample_rates_decoder": [
184
  8,
185
  8,
186
+ 4
 
187
  ],
188
+ "upsample_initial_channel_decoder": 256,
189
  "upsample_kernel_sizes_decoder": [
190
  16,
191
  16,
192
+ 8
 
193
  ],
194
  "use_sdp": true,
195
  "noise_scale": 1.0,
196
+ "inference_noise_scale": 0.3,
197
  "length_scale": 1,
198
  "noise_scale_dp": 1.0,
199
+ "inference_noise_scale_dp": 0.3,
200
  "max_inference_len": null,
201
  "init_discriminator": false,
202
  "use_spectral_norm_disriminator": false,
203
+ "use_speaker_embedding": true,
204
  "num_speakers": 0,
205
+ "speakers_file": "./logs/run-June-03-2022_10+09PM-f3efb56/speakers.json",
206
+ "d_vector_file": null,
207
  "speaker_embedding_channels": 256,
208
  "use_d_vector_file": false,
209
  "d_vector_dim": 0,
210
+ "detach_dp_input": true,
211
+ "use_language_embedding": true,
212
+ "embedded_language_dim": 4,
213
+ "num_languages": 1,
214
+ "language_ids_file": null,
215
+ "use_speaker_encoder_as_loss": false,
216
+ "speaker_encoder_config_path": "",
217
+ "speaker_encoder_model_path": "",
218
+ "condition_dp_on_speaker": true,
219
+ "freeze_encoder": false,
220
+ "freeze_DP": false,
221
+ "freeze_PE": false,
222
+ "freeze_flow_decoder": false,
223
+ "freeze_waveform_decoder": false
224
  },
 
 
 
 
225
  "lr_gen": 0.0002,
226
  "lr_disc": 0.0002,
227
  "lr_scheduler_gen": "ExponentialLR",
 
239
  "gen_loss_alpha": 1.0,
240
  "feat_loss_alpha": 1.0,
241
  "mel_loss_alpha": 45.0,
242
+ "dur_loss_alpha": 1.0,
243
+ "speaker_encoder_loss_alpha": 1.0,
244
  "return_wav": true,
245
+ "r": 1,
246
+ "num_speakers": 1,
247
+ "use_speaker_embedding": true,
248
+ "speakers_file": null,
249
+ "speaker_embedding_channels": 256,
250
+ "language_ids_file": null,
251
+ "use_language_embedding": true,
252
+ "use_d_vector_file": false,
253
+ "d_vector_file": null,
254
+ "d_vector_dim": 0
255
  }
language_ids.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "en": 0
3
+ }
speaker_ids.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "en": 0,
3
+ "ljspeech": 0
4
+ }