ks2303
/

tts-transformer-ljspeech

+# ############################################################################
+# Model: TransformerTTS
+# Tokens: Phonemes (English)
+# Losses: TransformerTTS.Loss (gate and guided-attention terms)
+# Training: LJSpeech
+# Author: Kasturi Saha
+# ############################################################################
+###################################
+# Experiment Parameters and setup #
+###################################
+# Random seed; it is also embedded in <output_folder> below, so every
+# seed writes to its own results directory.
+seed: 1986
+# Calls torch.manual_seed(<seed>) while this file is being loaded.
+__set_seed: !apply:torch.manual_seed [!ref <seed>]
+output_folder: !ref ./results/transformerTTS/<seed>
+# Checkpoint directory (consumed by <checkpointer>)
+save_folder: !ref <output_folder>/save
+# Text log written by <train_logger>
+train_log: !ref <output_folder>/train_log.txt
+# Limit handed to <epoch_counter>
+epochs: 5
+# NOTE(review): larger than <epochs> and not referenced elsewhere in this
+# file; confirm how the training script uses it.
+keep_checkpoint_interval: 50
+###################################
+# Progress Samples                #
+###################################
+# Progress samples are used to monitor the progress
+# of an ongoing training session by outputting samples
+# of spectrograms, alignments, etc. at regular intervals.
+# Whether to enable progress samples
+progress_samples: False
+# The path where the samples will be stored
+# (passed to <progress_sample_logger> as output_path)
+progress_sample_path: !ref <output_folder>/samples
+# The interval, in epochs. For instance, if it is set to 5,
+# progress samples will be output every 5 epochs
+progress_samples_interval: 1
+# The sample size for raw batch samples saved in batch.pth
+# (useful mostly for model debugging); passed to
+# <progress_sample_logger> as batch_sample_size
+progress_batch_sample_size: 3
+#################################
+# Data files and pre-processing #
+#################################
+# Location of the LJSpeech corpus, e.g., /localscratch/ljspeech
+data_folder: !ref ./data/LJSpeech-1.1
+# Preprocessed artifacts are derived from <data_folder>, so overriding
+# data_folder relocates them as well (the default paths are unchanged).
+preprocessed_data_folder: !ref <data_folder>/preprocessed/phone_seq
+preprocessed_melspectrogram_folder: !ref <data_folder>/preprocessed/melspectrogram
+# Manifest files, one per split.
+# NOTE(review): these resolve against the working directory rather than
+# <save_folder>; confirm that this is intended.
+train_json: !ref ./save/train.json
+valid_json: !ref ./save/valid.json
+test_json: !ref ./save/test.json
+splits: ["train", "valid", "test"]
+# Presumably percentages matched positionally with <splits> -- verify
+# against the data-preparation script.
+split_ratio: [70, 10, 20]
+# Presumably skips data preparation when True -- verify against the caller.
+skip_prep: False
+################################
+# Audio Parameters             #
+################################
+# Apart from n_mel_channels (passed to <model>), these keys are not
+# referenced elsewhere in this file; presumably the training script reads
+# them when computing mel spectrograms -- verify against the caller.
+sample_rate: 22050
+hop_length: 256
+win_length: 1024
+n_mel_channels: 80
+n_fft: 1024
+mel_fmin: 0.0
+mel_fmax: 8000.0
+mel_normalized: False
+power: 1.2
+norm: "slaney"
+mel_scale: "slaney"
+dynamic_range_compression: True
+################################
+# Optimization Hyperparameters #
+################################
+# learning_rate and weight_decay are forwarded to <opt_class>
+learning_rate: 0.001
+weight_decay: 0.000006
+batch_size: 8 # minimum 2
+num_workers: 0
+# Not referenced elsewhere in this file
+mask_padding: True
+# Per-split dataloader options. All three share <batch_size> and
+# <num_workers>, and each one constructs its own TextMelCollate instance
+# (with no arguments) as collate function.
+train_dataloader_opts:
+  batch_size: !ref <batch_size>
+  drop_last: False  # only the training loader sets this
+  num_workers: !ref <num_workers>
+  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
+valid_dataloader_opts:
+  batch_size: !ref <batch_size>
+  num_workers: !ref <num_workers>
+  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
+test_dataloader_opts:
+  batch_size: !ref <batch_size>
+  num_workers: !ref <num_workers>
+  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
+################################
+# Model Parameters and model   #
+################################
+n_symbols: 148 # fixed; depends on the symbols in textToSequence
+symbols_embedding_dim: 512
+hidden_dim: 256
+eprenet_dim: 512
+n_prenet_layers: 3
+dprenet_dim: 256
+postnet_dim: 256
+# NOTE(review): ff_dim (here) and n_layers (below) are not passed to
+# <model> in this file.
+ff_dim: 1024
+n_heads: 8
+n_layers: 6
+n_postnet_layers: 5
+# Decoder parameters
+# NOTE(review): n_frames_per_step, decoder_rnn_dim, max_decoder_steps,
+# gate_threshold, the two dropout rates and decoder_no_early_stopping are
+# not referenced by any other key in this file; confirm whether the
+# training script still reads them.
+# The number of frames in the target per encoder step
+n_frames_per_step: 1
+decoder_rnn_dim: 1024
+prenet_dim: 256
+max_decoder_steps: 1000
+gate_threshold: 0.5
+p_attention_dropout: 0.1
+p_decoder_dropout: 0.1
+decoder_no_early_stopping: False
+# Attention parameters
+# (not referenced elsewhere in this file)
+attention_rnn_dim: 1024
+attention_dim: 128
+# Location Layer parameters
+# (not referenced elsewhere in this file)
+attention_location_n_filters: 32
+attention_location_kernel_size: 31
+# Mel-post processing network parameters
+# (not referenced elsewhere in this file; <model> takes postnet_dim and
+# n_postnet_layers instead)
+postnet_embedding_dim: 256
+postnet_kernel_size: 5
+postnet_n_convolutions: 5
+# model
+# TransformerTTS network. Every constructor argument is a reference to a
+# top-level key of this file, so overriding one of those keys is enough to
+# change the model; no value is duplicated as a literal here.
+model: !new:TransformerTTS.TransformerTTS
+  n_mel_channels: !ref <n_mel_channels>
+  # symbols
+  n_symbols: !ref <n_symbols>
+  symbols_embedding_dim: !ref <symbols_embedding_dim>
+  # encoder prenet (previously the literals 512 and 3, equal to the defaults
+  # of the referenced keys)
+  eprenet_dim: !ref <eprenet_dim>
+  n_prenet_layers: !ref <n_prenet_layers>
+  # decoder (previously wired to <prenet_dim>; both keys default to 256)
+  dprenet_dim: !ref <dprenet_dim>
+  # postnet
+  postnet_dim: !ref <postnet_dim>
+  hidden_dim: !ref <hidden_dim>
+  n_postnet_layers: !ref <n_postnet_layers>
+  nhead: !ref <n_heads>
+# Guided-attention settings shared by the scheduler and the criterion below
+guided_attention_sigma: 0.2
+guided_attention_weight: 50.0
+guided_attention_weight_half_life: 10.
+# NOTE(review): the unit (epochs vs. steps) is not visible in this file --
+# confirm in TransformerTTS.Loss.
+guided_attention_hard_stop: 50
+gate_loss_weight: 1.0
+# Scheduler initialised with <guided_attention_weight> and the half-life
+# configured above
+guided_attention_scheduler: !new:speechbrain.nnet.schedulers.StepScheduler
+  initial_value: !ref <guided_attention_weight>
+  half_life: !ref <guided_attention_weight_half_life>
+# Training criterion; it receives both the static guided-attention weight
+# and the scheduler object defined above
+criterion: !new:TransformerTTS.Loss
+  gate_loss_weight: !ref <gate_loss_weight>
+  guided_attention_weight: !ref <guided_attention_weight>
+  guided_attention_sigma: !ref <guided_attention_sigma>
+  guided_attention_scheduler: !ref <guided_attention_scheduler>
+  guided_attention_hard_stop: !ref <guided_attention_hard_stop>
+# Modules exposed under the "modules" key (only the TTS model)
+modules:
+  model: !ref <model>
+# optimizer
+# !name: keeps torch.optim.Adam as a callable with lr and weight_decay
+# pre-bound instead of instantiating it here; the caller supplies the
+# remaining arguments.
+opt_class: !name:torch.optim.Adam
+  lr: !ref <learning_rate>
+  weight_decay: !ref <weight_decay>
+# epoch object
+# Epoch counter limited to <epochs>
+epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
+  limit: !ref <epochs>
+# File logger writing to <train_log>
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+  save_file: !ref <train_log>
+# annealing_function
+# Learning-rate schedule given as (steps, lr) pairs: 0.0005 at 6000,
+# 0.0003 at 8000 and 0.0001 at 10000 steps.
+lr_annealing: !new:speechbrain.nnet.schedulers.IntervalScheduler
+  intervals:
+    - steps: 6000
+      lr: 0.0005
+    - steps: 8000
+      lr: 0.0003
+    - steps: 10000
+      lr: 0.0001
+# checkpointer
+# Uses <save_folder> as checkpoint directory; the model, the epoch counter
+# and the learning-rate scheduler are registered as recoverables.
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+  checkpoints_dir: !ref <save_folder>
+  recoverables:
+    model: !ref <model>
+    counter: !ref <epoch_counter>
+    scheduler: !ref <lr_annealing>
+# Logger for progress samples: writes to <progress_sample_path>, uses
+# <progress_batch_sample_size> as batch sample size and maps the
+# "raw_batch" sample to the "raw" format.
+progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
+  output_path: !ref <progress_sample_path>
+  batch_sample_size: !ref <progress_batch_sample_size>
+  formats:
+    raw_batch: raw
+# Not referenced elsewhere in this file; presumably a gradient-clipping
+# threshold read by the training loop -- confirm.
+max_grad_norm: 1.0
+# Pretrainer with <model> registered as its only loadable
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+  loadables:
+    model: !ref <model>