ks2303 committed on
Commit
e846e3e
1 Parent(s): 9ce4296

Upload hparams.yaml

Browse files
Files changed (1) hide show
  1. hparams.yaml +212 -0
hparams.yaml ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ############################################################################
2
+ # Model: TransformerTTS
3
+ # Tokens: Phonemes (English)
4
+ # Losses: Transducer
5
+ # Training: LJSpeech
6
+ # Author: Kasturi Saha
7
+ ############################################################################
8
+
9
+
10
+ ###################################
11
+ # Experiment Parameters and setup #
12
+ ###################################
13
+ seed: 1986
14
+ __set_seed: !apply:torch.manual_seed [!ref <seed>]
15
+ output_folder: !ref ./results/transformerTTS/<seed>
16
+ save_folder: !ref <output_folder>/save
17
+ train_log: !ref <output_folder>/train_log.txt
18
+ epochs: 5
19
+ keep_checkpoint_interval: 50
20
+
21
+ ###################################
22
+ # Progress Samples #
23
+ ###################################
24
+ # Progress samples are used to monitor the progress
25
+ # of an ongoing training session by outputting samples
26
+ # of spectrograms, alignments, etc at regular intervals
27
+
28
+ # Whether to enable progress samples
29
+ progress_samples: False
30
+
31
+ # The path where the samples will be stored
32
+ progress_sample_path: !ref <output_folder>/samples
33
+ # The interval, in epochs. For instance, if it is set to 5,
34
+ # progress samples will be output every 5 epochs
35
+ progress_samples_interval: 1
36
+ # The sample size for raw batch samples saved in batch.pth
37
+ # (useful mostly for model debugging)
38
+ progress_batch_sample_size: 3
39
+
40
+ #################################
41
+ # Data files and pre-processing #
42
+ #################################
43
+ data_folder: !ref ./data/LJSpeech-1.1 # e.g., /localscratch/ljspeech
44
+ preprocessed_data_folder: !ref ./data/LJSpeech-1.1/preprocessed/phone_seq # e.g., /localscratch/ljspeech/preprocessed/phone_seq
45
+ preprocessed_melspectrogram_folder: !ref ./data/LJSpeech-1.1/preprocessed/melspectrogram # e.g., /localscratch/ljspeech/preprocessed/melspectrogram
46
+
47
+ train_json: !ref ./save/train.json
48
+ valid_json: !ref ./save/valid.json
49
+ test_json: !ref ./save/test.json
50
+
51
+ splits: ["train", "valid", "test"]
52
+ split_ratio: [70, 10, 20]
53
+
54
+ skip_prep: False
55
+
56
+ ################################
57
+ # Audio Parameters #
58
+ ################################
59
+ sample_rate: 22050
60
+ hop_length: 256
61
+ win_length: 1024
62
+ n_mel_channels: 80
63
+ n_fft: 1024
64
+ mel_fmin: 0.0
65
+ mel_fmax: 8000.0
66
+ mel_normalized: False
67
+ power: 1.2
68
+ norm: "slaney"
69
+ mel_scale: "slaney"
70
+ dynamic_range_compression: True
71
+
72
+ ################################
73
+ # Optimization Hyperparameters #
74
+ ################################
75
+ learning_rate: 0.001
76
+ weight_decay: 0.000006
77
+ batch_size: 8 # minimum 2
78
+ num_workers: 0
79
+ mask_padding: True
80
+
81
+ train_dataloader_opts:
82
+ batch_size: !ref <batch_size>
83
+ drop_last: False # keep the last incomplete batch
84
+ num_workers: !ref <num_workers>
85
+ collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
86
+
87
+ valid_dataloader_opts:
88
+ batch_size: !ref <batch_size>
89
+ num_workers: !ref <num_workers>
90
+ collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
91
+
92
+ test_dataloader_opts:
93
+ batch_size: !ref <batch_size>
94
+ num_workers: !ref <num_workers>
95
+ collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
96
+
97
+ ################################
98
+ # Model Parameters and model #
99
+ ################################
100
+ n_symbols: 148 # fixed; depends on the symbols in textToSequence
101
+ symbols_embedding_dim: 512
102
+ hidden_dim: 256
103
+ eprenet_dim: 512
104
+ n_prenet_layers: 3
105
+ dprenet_dim: 256
106
+ postnet_dim: 256
107
+ ff_dim: 1024
108
+ n_heads: 8
109
+ n_layers: 6
110
+ n_postnet_layers: 5
111
+
112
+ # Decoder parameters
113
+ # The number of frames in the target per encoder step
114
+ n_frames_per_step: 1
115
+ decoder_rnn_dim: 1024
116
+ prenet_dim: 256
117
+ max_decoder_steps: 1000
118
+ gate_threshold: 0.5
119
+ p_attention_dropout: 0.1
120
+ p_decoder_dropout: 0.1
121
+ decoder_no_early_stopping: False
122
+
123
+ # Attention parameters
124
+ attention_rnn_dim: 1024
125
+ attention_dim: 128
126
+
127
+ # Location Layer parameters
128
+ attention_location_n_filters: 32
129
+ attention_location_kernel_size: 31
130
+
131
+ # Mel-post processing network parameters
132
+ postnet_embedding_dim: 256
133
+ postnet_kernel_size: 5
134
+ postnet_n_convolutions: 5
135
+
136
+ #model
137
+ model: !new:TransformerTTS.TransformerTTS
138
+ n_mel_channels: !ref <n_mel_channels>
139
+ # symbols
140
+ n_symbols: !ref <n_symbols>
141
+ symbols_embedding_dim: !ref <symbols_embedding_dim>
142
+ eprenet_dim: !ref <eprenet_dim>
143
+ n_prenet_layers: !ref <n_prenet_layers>
144
+ # decoder
145
+ dprenet_dim: !ref <dprenet_dim>
146
+ # postnet
147
+ postnet_dim: !ref <postnet_dim>
148
+ hidden_dim: !ref <hidden_dim>
149
+ n_postnet_layers: !ref <n_postnet_layers>
150
+ nhead: !ref <n_heads>
151
+
152
+ guided_attention_sigma: 0.2
153
+ guided_attention_weight: 50.0
154
+ guided_attention_weight_half_life: 10.
155
+ guided_attention_hard_stop: 50
156
+ gate_loss_weight: 1.0
157
+
158
+ guided_attention_scheduler: !new:speechbrain.nnet.schedulers.StepScheduler
159
+ initial_value: !ref <guided_attention_weight>
160
+ half_life: !ref <guided_attention_weight_half_life>
161
+
162
+ criterion: !new:TransformerTTS.Loss
163
+ gate_loss_weight: !ref <gate_loss_weight>
164
+ guided_attention_weight: !ref <guided_attention_weight>
165
+ guided_attention_sigma: !ref <guided_attention_sigma>
166
+ guided_attention_scheduler: !ref <guided_attention_scheduler>
167
+ guided_attention_hard_stop: !ref <guided_attention_hard_stop>
168
+
169
+ modules:
170
+ model: !ref <model>
171
+
172
+ #optimizer
173
+ opt_class: !name:torch.optim.Adam
174
+ lr: !ref <learning_rate>
175
+ weight_decay: !ref <weight_decay>
176
+
177
+ #epoch object
178
+ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
179
+ limit: !ref <epochs>
180
+
181
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
182
+ save_file: !ref <train_log>
183
+
184
+ #annealing_function
185
+ lr_annealing: !new:speechbrain.nnet.schedulers.IntervalScheduler
186
+ intervals:
187
+ - steps: 6000
188
+ lr: 0.0005
189
+ - steps: 8000
190
+ lr: 0.0003
191
+ - steps: 10000
192
+ lr: 0.0001
193
+
194
+ #checkpointer
195
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
196
+ checkpoints_dir: !ref <save_folder>
197
+ recoverables:
198
+ model: !ref <model>
199
+ counter: !ref <epoch_counter>
200
+ scheduler: !ref <lr_annealing>
201
+
202
+ progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
203
+ output_path: !ref <progress_sample_path>
204
+ batch_sample_size: !ref <progress_batch_sample_size>
205
+ formats:
206
+ raw_batch: raw
207
+
208
+ max_grad_norm: 1.0
209
+
210
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
211
+ loadables:
212
+ model: !ref <model>