# Hyperparameters for training the TransformerTTS model on LJSpeech-1.1.
# Format: HyperPyYAML (uses the !ref, !new, !name and !apply tags).

# ---------------------------------------------------------------------------
# Experiment setup: RNG seed, output locations and training length.
# ---------------------------------------------------------------------------
seed: 1986
# Calls torch.manual_seed(<seed>) while this file is being loaded.
__set_seed: !apply:torch.manual_seed [!ref <seed>]
# Every artifact of a run is written below a per-seed folder.
output_folder: !ref ./results/transformerTTS/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt
# Upper bound handed to `epoch_counter`.
epochs: 5
# NOTE(review): consumed by the training script, which is not visible here;
# presumably "retain a checkpoint every N epochs" — confirm. With epochs: 5
# an interval of 50 would never be reached.
keep_checkpoint_interval: 50

# ---------------------------------------------------------------------------
# Progress samples.
# `progress_sample_path` and `progress_batch_sample_size` are forwarded to
# `progress_sample_logger`; the other two keys are read outside this file.
# ---------------------------------------------------------------------------
# NOTE(review): presumably the on/off switch for saving samples — confirm.
progress_samples: False
# Folder handed to the sample logger as its output path.
progress_sample_path: !ref <output_folder>/samples
# NOTE(review): presumably measured in epochs — confirm against the trainer.
progress_samples_interval: 1
# Handed to the sample logger as `batch_sample_size`.
progress_batch_sample_size: 3

# ---------------------------------------------------------------------------
# Dataset locations, split manifests and preparation switches.
# ---------------------------------------------------------------------------
data_folder: !ref ./data/LJSpeech-1.1
preprocessed_data_folder: !ref ./data/LJSpeech-1.1/preprocessed/phone_seq
preprocessed_melspectrogram_folder: !ref ./data/LJSpeech-1.1/preprocessed/melspectrogram

# JSON manifests, one per split.
# NOTE(review): these sit under ./save, not under <save_folder> — confirm
# that this is intentional.
train_json: !ref ./save/train.json
valid_json: !ref ./save/valid.json
test_json: !ref ./save/test.json

# Split names and their ratios, in matching order.
splits: ["train", "valid", "test"]
split_ratio: [70, 10, 20]

# NOTE(review): presumably skips the data-preparation step when True — confirm.
skip_prep: False

# ---------------------------------------------------------------------------
# Audio / mel-spectrogram settings.
# Only `n_mel_channels` is referenced inside this file (by `model`); the
# remaining keys are read by code that is not visible here.
# ---------------------------------------------------------------------------
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
mel_normalized: False
power: 1.2
norm: "slaney"
mel_scale: "slaney"
dynamic_range_compression: True

# ---------------------------------------------------------------------------
# Optimisation hyperparameters.
# ---------------------------------------------------------------------------
# Forwarded to `opt_class` as `lr` and `weight_decay`.
learning_rate: 0.001
weight_decay: 0.000006
# Shared by the train / valid / test dataloader option maps.
batch_size: 8
num_workers: 0
# NOTE(review): not referenced by any other entry in this file; read elsewhere.
mask_padding: True

# ---------------------------------------------------------------------------
# Dataloader options, one map per split. Each map builds its own
# TextMelCollate instance as `collate_fn`.
# ---------------------------------------------------------------------------
train_dataloader_opts:
    batch_size: !ref <batch_size>
    # Only the training loader sets drop_last explicitly.
    drop_last: False
    num_workers: !ref <num_workers>
    collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

valid_dataloader_opts:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>
    collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

test_dataloader_opts:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>
    collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

# ---------------------------------------------------------------------------
# Transformer TTS model sizes.
# n_symbols, symbols_embedding_dim, hidden_dim, postnet_dim,
# n_postnet_layers and n_heads are forwarded to `model`.
# ---------------------------------------------------------------------------
n_symbols: 148
symbols_embedding_dim: 512
hidden_dim: 256
eprenet_dim: 512
n_prenet_layers: 3
dprenet_dim: 256
postnet_dim: 256
# NOTE(review): ff_dim and n_layers are not referenced by any other entry in
# this file — confirm whether the model is meant to receive them.
ff_dim: 1024
n_heads: 8
n_layers: 6
n_postnet_layers: 5

# ---------------------------------------------------------------------------
# Decoder / attention / post-net settings.
# NOTE(review): with the possible exception of `prenet_dim`, none of these
# keys is referenced by another entry in this file. The names (RNN and
# location-attention sizes) suggest a carry-over from a Tacotron2 recipe —
# confirm that the training script still reads them before removing any.
# ---------------------------------------------------------------------------
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
decoder_no_early_stopping: False

# Attention sizes.
attention_rnn_dim: 1024
attention_dim: 128

# Location-layer settings.
attention_location_n_filters: 32
attention_location_kernel_size: 31

# Post-net settings.
postnet_embedding_dim: 256
postnet_kernel_size: 5
postnet_n_convolutions: 5

# ---------------------------------------------------------------------------
# Model: instantiates TransformerTTS.TransformerTTS with the keyword
# arguments below. Every size is taken from a top-level hyperparameter so
# that overriding the hyperparameter actually reaches the model.
# ---------------------------------------------------------------------------
model: !new:TransformerTTS.TransformerTTS
    n_mel_channels: !ref <n_mel_channels>

    # Text side: symbol embedding and encoder pre-net.
    n_symbols: !ref <n_symbols>
    symbols_embedding_dim: !ref <symbols_embedding_dim>
    eprenet_dim: !ref <eprenet_dim>
    n_prenet_layers: !ref <n_prenet_layers>

    # Decoder pre-net: uses the dedicated `dprenet_dim` hyperparameter
    # (same value, 256, as `prenet_dim`).
    dprenet_dim: !ref <dprenet_dim>

    # Post-net, hidden size and attention heads.
    postnet_dim: !ref <postnet_dim>
    hidden_dim: !ref <hidden_dim>
    n_postnet_layers: !ref <n_postnet_layers>
    nhead: !ref <n_heads>

# ---------------------------------------------------------------------------
# Loss weighting. These values are forwarded to `criterion`;
# the weight and its half-life also configure `guided_attention_scheduler`.
# ---------------------------------------------------------------------------
guided_attention_sigma: 0.2
guided_attention_weight: 50.0
guided_attention_weight_half_life: 10.
guided_attention_hard_stop: 50
gate_loss_weight: 1.0

# StepScheduler for the guided-attention weight: starts at
# <guided_attention_weight> and is given <guided_attention_weight_half_life>
# as its half-life.
guided_attention_scheduler: !new:speechbrain.nnet.schedulers.StepScheduler
    initial_value: !ref <guided_attention_weight>
    half_life: !ref <guided_attention_weight_half_life>

# Training loss: instantiates TransformerTTS.Loss with the gate-loss weight
# and the guided-attention settings (including the scheduler object).
criterion: !new:TransformerTTS.Loss
    gate_loss_weight: !ref <gate_loss_weight>
    guided_attention_weight: !ref <guided_attention_weight>
    guided_attention_sigma: !ref <guided_attention_sigma>
    guided_attention_scheduler: !ref <guided_attention_scheduler>
    guided_attention_hard_stop: !ref <guided_attention_hard_stop>

# Named modules made available to the training script; only the model here.
modules:
    model: !ref <model>

# Optimizer factory. `!name:` binds the keyword arguments to torch.optim.Adam
# without instantiating it; the caller supplies the remaining arguments.
opt_class: !name:torch.optim.Adam
    lr: !ref <learning_rate>
    weight_decay: !ref <weight_decay>

# Epoch counter limited to <epochs>.
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <epochs>

# File-based training logger writing to <train_log>.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

# Learning-rate schedule: a list of (steps, lr) intervals handed to
# IntervalScheduler.
# NOTE(review): presumably each `lr` takes effect once the optimizer has
# taken `steps` updates — confirm against the scheduler documentation.
lr_annealing: !new:speechbrain.nnet.schedulers.IntervalScheduler
    intervals:
        - steps: 6000
          lr: 0.0005
        - steps: 8000
          lr: 0.0003
        - steps: 10000
          lr: 0.0001

# Checkpointer rooted at <save_folder>. The recoverables registered here are
# the model, the epoch counter and the learning-rate scheduler.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        model: !ref <model>
        counter: !ref <epoch_counter>
        scheduler: !ref <lr_annealing>

# Logger for progress samples: writes under <progress_sample_path> and
# receives <progress_batch_sample_size> as `batch_sample_size`.
progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
    output_path: !ref <progress_sample_path>
    batch_sample_size: !ref <progress_batch_sample_size>
    # Output format per sample key.
    formats:
        raw_batch: raw

# NOTE(review): presumably the gradient-norm clipping threshold used by the
# training loop — not referenced elsewhere in this file; confirm.
max_grad_norm: 1.0

# Pretrainer with the model registered as its only loadable.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>