sulaimank committed on
Commit
65de50d
·
verified ·
1 Parent(s): b475b85

Update hyperparams.yaml

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +42 -222
hyperparams.yaml CHANGED
@@ -1,125 +1,19 @@
1
- # Generated 2024-10-12 from:
2
- # /workspace/speechbrain/recipes/LJSpeech/TTS/tacotron2/hparams/train.yaml
3
- # yamllint disable
4
- ############################################################################
5
- # Model: Tacotron2
6
- # Tokens: Raw characters (English text)
7
- # losses: Transducer
8
- # Training: LJSpeech
9
- # Authors: Georges Abous-Rjeili, Artem Ploujnikov, Yingzhi Wang
10
- # ############################################################################
11
 
12
-
13
- ###################################
14
- # Experiment Parameters and setup #
15
- ###################################
16
- seed: 1234
17
- __set_seed: !apply:speechbrain.utils.seed_everything [1234]
18
- output_folder: ./results/tacotron2/1234
19
- save_folder: ./results/tacotron2/1234/save
20
- train_log: ./results/tacotron2/1234/train_log.txt
21
- epochs: 1000
22
- keep_checkpoint_interval: 100
23
- wandb_id: tacotron-sk
24
- wandb_user: sulaiman-kagumire
25
- wandb_project: tacotron2-lg-cv
26
- ###################################
27
- # Progress Samples #
28
- ###################################
29
- # Progress samples are used to monitor the progress
30
- # of an ongoing training session by outputting samples
31
- # of spectrograms, alignments, etc at regular intervals
32
-
33
- # Whether to enable progress samples
34
- progress_samples: false
35
-
36
- # The path where the samples will be stored
37
- progress_sample_path: ./results/tacotron2/1234/samples
38
- # The interval, in epochs. For instance, if it is set to 5,
39
- # progress samples will be output every 5 epochs
40
- progress_samples_interval: 100
41
- # The sample size for raw batch samples saved in batch.pth
42
- # (useful mostly for model debugging)
43
- progress_batch_sample_size: 3
44
-
45
- #################################
46
- # Data files and pre-processing #
47
- #################################
48
- data_folder: data
49
- # e.g, /localscratch/ljspeech
50
-
51
- train_json: ./results/tacotron2/1234/save/train.json
52
- valid_json: ./results/tacotron2/1234/save/valid.json
53
- test_json: ./results/tacotron2/1234/save/test.json
54
-
55
- splits: [train, valid]
56
- split_ratio: [90, 10]
57
-
58
- skip_prep: false
59
- init_from_pretrained: true
60
- # Use the original preprocessing from nvidia
61
- # The cleaners to be used (applicable to nvidia only)
62
- text_cleaners: [transliteration_cleaners]
63
-
64
- ################################
65
- # Audio Parameters #
66
- ################################
67
- sample_rate: 22050
68
- hop_length: 256
69
- win_length: 1024
70
  n_mel_channels: 80
71
- n_fft: 1024
72
- mel_fmin: 0.0
73
- mel_fmax: 8000.0
74
- mel_normalized: false
75
- power: 1
76
- norm: slaney
77
- mel_scale: slaney
78
- dynamic_range_compression: true
79
-
80
- ################################
81
- # Optimization Hyperparameters #
82
- ################################
83
- learning_rate: 0.001
84
- weight_decay: 0.000006
85
- batch_size: 256
86
- num_workers: 96
87
- mask_padding: true
88
- guided_attention_sigma: 0.2
89
- guided_attention_weight: 50.0
90
- guided_attention_weight_half_life: 10.
91
- guided_attention_hard_stop: 50
92
- gate_loss_weight: 1.0
93
-
94
- train_dataloader_opts:
95
- batch_size: 256
96
- drop_last: false #True #False
97
- num_workers: 96
98
- collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
99
-
100
- valid_dataloader_opts:
101
- batch_size: 256
102
- num_workers: 96
103
- collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
104
-
105
- test_dataloader_opts:
106
- batch_size: 256
107
- num_workers: 96
108
- collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
109
-
110
- ################################
111
- # Model Parameters and model #
112
- ################################
113
- n_symbols: 148 #fixed depending on symbols in textToSequence
114
  symbols_embedding_dim: 512
115
-
116
- # Encoder parameters
117
  encoder_kernel_size: 5
118
  encoder_n_convolutions: 3
119
  encoder_embedding_dim: 512
120
-
121
- # Decoder parameters
122
- # The number of frames in the target per encoder step
 
123
  n_frames_per_step: 1
124
  decoder_rnn_dim: 1024
125
  prenet_dim: 256
@@ -127,123 +21,49 @@ max_decoder_steps: 1000
127
  gate_threshold: 0.5
128
  p_attention_dropout: 0.1
129
  p_decoder_dropout: 0.1
130
- decoder_no_early_stopping: false
131
-
132
- # Attention parameters
133
- attention_rnn_dim: 1024
134
- attention_dim: 128
135
-
136
- # Location Layer parameters
137
- attention_location_n_filters: 32
138
- attention_location_kernel_size: 31
139
-
140
- # Mel-post processing network parameters
141
  postnet_embedding_dim: 512
142
  postnet_kernel_size: 5
143
  postnet_n_convolutions: 5
 
 
144
 
145
- mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
146
- sample_rate: 22050
147
- hop_length: 256
148
- win_length: 1024
149
- n_fft: 1024
150
- n_mels: 80
151
- f_min: 0.0
152
- f_max: 8000.0
153
- power: 1
154
- normalized: false
155
- norm: slaney
156
- mel_scale: slaney
157
- compression: true
158
-
159
- #model
160
- model: &id002 !new:speechbrain.lobes.models.Tacotron2.Tacotron2
161
-
162
- #optimizer
163
- mask_padding: true
164
- n_mel_channels: 80
165
  # symbols
166
- n_symbols: 148
167
- symbols_embedding_dim: 512
168
  # encoder
169
- encoder_kernel_size: 5
170
- encoder_n_convolutions: 3
171
- encoder_embedding_dim: 512
172
  # attention
173
- attention_rnn_dim: 1024
174
- attention_dim: 128
175
  # attention location
176
- attention_location_n_filters: 32
177
- attention_location_kernel_size: 31
178
  # decoder
179
- n_frames_per_step: 1
180
- decoder_rnn_dim: 1024
181
- prenet_dim: 256
182
- max_decoder_steps: 1000
183
- gate_threshold: 0.5
184
- p_attention_dropout: 0.1
185
- p_decoder_dropout: 0.1
186
  # postnet
187
- postnet_embedding_dim: 512
188
- postnet_kernel_size: 5
189
- postnet_n_convolutions: 5
190
- decoder_no_early_stopping: false
191
-
192
- guided_attention_scheduler: &id001 !new:speechbrain.nnet.schedulers.StepScheduler
193
- initial_value: 50.0
194
- half_life: 10.
195
 
196
- criterion: !new:speechbrain.lobes.models.Tacotron2.Loss
197
- gate_loss_weight: 1.0
198
- guided_attention_weight: 50.0
199
- guided_attention_sigma: 0.2
200
- guided_attention_scheduler: *id001
201
- guided_attention_hard_stop: 50
202
 
203
  modules:
204
- model: *id002
205
- opt_class: !name:torch.optim.Adam
206
- lr: 0.001
207
- weight_decay: 0.000006
208
-
209
- #epoch object
210
- epoch_counter: &id003 !new:speechbrain.utils.epoch_loop.EpochCounter
211
- limit: 1000
212
-
213
- # train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
214
- # save_file: !ref <train_log>
215
- train_logger: !new:speechbrain.utils.train_logger.WandBLogger
216
- initializer: !name:wandb.init
217
- id: tacotron-sk
218
- name: tacotron-sk
219
- entity: sulaiman-kagumire
220
- project: tacotron2-lg-cv
221
- reinit: true
222
- resume: allow
223
-
224
-
225
- #annealing_function
226
- lr_annealing: &id004 !new:speechbrain.nnet.schedulers.IntervalScheduler
227
-
228
- #infer: !name:speechbrain.lobes.models.Tacotron2.infer
229
-
230
- intervals:
231
- - steps: 6000
232
- lr: 0.0005
233
- - steps: 8000
234
- lr: 0.0003
235
- - steps: 10000
236
- lr: 0.0001
237
 
238
- #checkpointer
239
- checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
240
- checkpoints_dir: ./results/tacotron2/1234/save
241
- recoverables:
242
- model: *id002
243
- counter: *id003
244
- scheduler: *id004
245
- progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
246
- output_path: ./results/tacotron2/1234/samples
247
- batch_sample_size: 3
248
- formats:
249
- raw_batch: raw
 
1
+ ################################
2
+ # Model: Tacotron2 for TTS
3
+ # Authors: Artem Ploujnikov, Yingzhi Wang
4
+ # ################################
 
 
 
 
 
 
5
 
6
+ mask_padding: true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  n_mel_channels: 80
8
+ n_symbols: 148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  symbols_embedding_dim: 512
 
 
10
  encoder_kernel_size: 5
11
  encoder_n_convolutions: 3
12
  encoder_embedding_dim: 512
13
+ attention_rnn_dim: 1024
14
+ attention_dim: 128
15
+ attention_location_n_filters: 32
16
+ attention_location_kernel_size: 31
17
  n_frames_per_step: 1
18
  decoder_rnn_dim: 1024
19
  prenet_dim: 256
 
21
  gate_threshold: 0.5
22
  p_attention_dropout: 0.1
23
  p_decoder_dropout: 0.1
 
 
 
 
 
 
 
 
 
 
 
24
  postnet_embedding_dim: 512
25
  postnet_kernel_size: 5
26
  postnet_n_convolutions: 5
27
+ decoder_no_early_stopping: false
28
+ sample_rate: 22050
29
 
30
+ # Model
31
+ model: !new:speechbrain.lobes.models.Tacotron2.Tacotron2
32
+ mask_padding: !ref <mask_padding>
33
+ n_mel_channels: !ref <n_mel_channels>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  # symbols
35
+ n_symbols: !ref <n_symbols>
36
+ symbols_embedding_dim: !ref <symbols_embedding_dim>
37
  # encoder
38
+ encoder_kernel_size: !ref <encoder_kernel_size>
39
+ encoder_n_convolutions: !ref <encoder_n_convolutions>
40
+ encoder_embedding_dim: !ref <encoder_embedding_dim>
41
  # attention
42
+ attention_rnn_dim: !ref <attention_rnn_dim>
43
+ attention_dim: !ref <attention_dim>
44
  # attention location
45
+ attention_location_n_filters: !ref <attention_location_n_filters>
46
+ attention_location_kernel_size: !ref <attention_location_kernel_size>
47
  # decoder
48
+ n_frames_per_step: !ref <n_frames_per_step>
49
+ decoder_rnn_dim: !ref <decoder_rnn_dim>
50
+ prenet_dim: !ref <prenet_dim>
51
+ max_decoder_steps: !ref <max_decoder_steps>
52
+ gate_threshold: !ref <gate_threshold>
53
+ p_attention_dropout: !ref <p_attention_dropout>
54
+ p_decoder_dropout: !ref <p_decoder_dropout>
55
  # postnet
56
+ postnet_embedding_dim: !ref <postnet_embedding_dim>
57
+ postnet_kernel_size: !ref <postnet_kernel_size>
58
+ postnet_n_convolutions: !ref <postnet_n_convolutions>
59
+ decoder_no_early_stopping: !ref <decoder_no_early_stopping>
 
 
 
 
60
 
61
+ # Function that converts the text into a sequence of valid characters.
62
+ text_to_sequence: !name:speechbrain.utils.text_to_sequence.text_to_sequence
 
 
 
 
63
 
64
  modules:
65
+ model: !ref <model>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
68
+ loadables:
69
+ model: !ref <model>