############################################
# Experiment parameters and setup
############################################
seed: 1234
__set_seed: !apply:torch.manual_seed [1234]
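
# `!apply:torch.manual_seed [1234]` is HyperPyYAML syntax that calls
# torch.manual_seed(1234) once, when this file is loaded. The literal seed
# and the hard-coded result paths below suggest this file was dumped from an
# already-resolved config; a hand-written hparams file would normally use
# cross-references instead, e.g. (sketch, not taken from this file):
#   __set_seed: !apply:torch.manual_seed [!ref <seed>]
#   output_folder: !ref ./results/tacotron2/<seed>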
output_folder: ./results/tacotron2/1234
save_folder: ./results/tacotron2/1234/save
train_log: ./results/tacotron2/1234/train_log.txt

epochs: 500
# Interval (in epochs) at which checkpoints are kept permanently
keep_checkpoint_interval: 50

# Weights & Biases run identification
wandb_id: tacotron2-luganda
wandb_user: sulaiman-kagumire
wandb_project: tts-luganda

# Warm-start the model from a pretrained Tacotron2 checkpoint
init_from_pretrained: true

############################################
# Progress samples
############################################
# Progress samples periodically save spectrograms and alignments so an
# ongoing training run can be monitored.

# Whether to output progress samples during training
progress_samples: false

# Where the progress samples are stored
progress_sample_path: ./results/tacotron2/1234/samples

# Interval (in epochs) between progress samples
progress_samples_interval: 1

# Number of items from a batch to include in raw progress samples
progress_batch_sample_size: 3

############################################
# Data files and pre-processing
############################################
data_folder: data_folder  # placeholder; point this at the dataset root

train_json: ./results/tacotron2/1234/save/train.json
valid_json: ./results/tacotron2/1234/save/valid.json
test_json: ./results/tacotron2/1234/save/test.json

splits: [train, valid, test]
split_ratio: [80, 10, 10]
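
# With split_ratio [80, 10, 10], data preparation assigns roughly 80% of the
# utterances to train and 10% each to valid and test; e.g. 10,000 utterances
# split into about 8,000 / 1,000 / 1,000.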

# Skip data preparation when the JSON manifests above already exist
skip_prep: false

# Cleaners applied to the transcript text before it is encoded into symbols
text_cleaners: [basic_cleaners]
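
# In the Tacotron-style text frontend, basic_cleaners is the minimal
# pipeline: lowercase the text and collapse repeated whitespace, with no
# English-specific number expansion or transliteration; a sensible default
# for a non-English language such as Luganda.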

############################################
# Audio parameters
############################################
sample_rate: 22050
hop_length: 256
win_length: 1024
n_mel_channels: 80
n_fft: 1024
mel_fmin: 0.0
mel_fmax: 8000.0
mel_normalized: false
power: 1
norm: slaney
mel_scale: slaney
dynamic_range_compression: true
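
# At 22050 Hz, hop_length 256 gives a frame shift of 256 / 22050 ≈ 11.6 ms
# and win_length 1024 a window of ≈ 46.4 ms; n_fft 1024 yields 513 linear
# frequency bins, mapped onto 80 mel channels spanning 0 Hz to 8 kHz.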

############################################
# Optimization hyperparameters
############################################
learning_rate: 0.001
weight_decay: 0.000006
batch_size: 256
num_workers: 8
# Exclude padded frames from the loss
mask_padding: true

# Guided attention: penalizes off-diagonal text-to-speech alignments early
# in training (see guided_attention_scheduler and criterion below)
guided_attention_sigma: 0.2
guided_attention_weight: 50.0
guided_attention_weight_half_life: 10.
guided_attention_hard_stop: 50
gate_loss_weight: 1.0

train_dataloader_opts:
  batch_size: 256
  drop_last: false
  num_workers: 8
  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

valid_dataloader_opts:
  batch_size: 256
  num_workers: 8
  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate

test_dataloader_opts:
  batch_size: 256
  num_workers: 8
  collate_fn: !new:speechbrain.lobes.models.Tacotron2.TextMelCollate
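
# TextMelCollate makes variable-length utterances stackable: it sorts each
# batch by decreasing text length and zero-pads the encoded text and target
# mel spectrograms to the longest item, so mask_padding above is what keeps
# those padded positions from contributing to the loss.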

############################################
# Model parameters
############################################
# Input symbols
n_symbols: 148
symbols_embedding_dim: 512

# Encoder
encoder_kernel_size: 5
encoder_n_convolutions: 3
encoder_embedding_dim: 512

# Decoder
n_frames_per_step: 1
decoder_rnn_dim: 1024
prenet_dim: 256
max_decoder_steps: 1000
gate_threshold: 0.5
p_attention_dropout: 0.1
p_decoder_dropout: 0.1
decoder_no_early_stopping: false

# Attention
attention_rnn_dim: 1024
attention_dim: 128

# Location-sensitive attention
attention_location_n_filters: 32
attention_location_kernel_size: 31

# Postnet
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5

mel_spectogram: !name:speechbrain.lobes.models.Tacotron2.mel_spectogram
  sample_rate: 22050
  hop_length: 256
  win_length: 1024
  n_fft: 1024
  n_mels: 80
  f_min: 0.0
  f_max: 8000.0
  power: 1
  normalized: false
  norm: slaney
  mel_scale: slaney
  compression: true
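
# Note: "spectogram" (missing "r") matches the actual identifier in
# speechbrain.lobes.models.Tacotron2, so the spelling must stay as-is.
# `!name:` binds these keyword arguments without calling the function; the
# data pipeline then invokes it per utterance, along the lines of
# (illustrative sketch only):
#   mel = hparams["mel_spectogram"](audio=signal)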

model: &id002 !new:speechbrain.lobes.models.Tacotron2.Tacotron2
  mask_padding: true
  n_mel_channels: 80
  # Symbols
  n_symbols: 148
  symbols_embedding_dim: 512
  # Encoder
  encoder_kernel_size: 5
  encoder_n_convolutions: 3
  encoder_embedding_dim: 512
  # Attention
  attention_rnn_dim: 1024
  attention_dim: 128
  # Location-sensitive attention
  attention_location_n_filters: 32
  attention_location_kernel_size: 31
  # Decoder
  n_frames_per_step: 1
  decoder_rnn_dim: 1024
  prenet_dim: 256
  max_decoder_steps: 1000
  gate_threshold: 0.5
  p_attention_dropout: 0.1
  p_decoder_dropout: 0.1
  # Postnet
  postnet_embedding_dim: 512
  postnet_kernel_size: 5
  postnet_n_convolutions: 5
  decoder_no_early_stopping: false

guided_attention_scheduler: &id001 !new:speechbrain.nnet.schedulers.StepScheduler
  initial_value: 50.0
  half_life: 10.
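
# With half_life: 10, this scheduler roughly halves the guided attention
# weight every 10 epochs: 50 at the start, about 25 after 10 epochs, about
# 12.5 after 20. Combined with guided_attention_hard_stop: 50 below, the
# penalty fades out and is then disabled entirely after epoch 50.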

criterion: !new:speechbrain.lobes.models.Tacotron2.Loss
  gate_loss_weight: 1.0
  guided_attention_weight: 50.0
  guided_attention_sigma: 0.2
  guided_attention_scheduler: *id001
  guided_attention_hard_stop: 50
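
# The Tacotron2 loss combines mel-spectrogram regression (before and after
# the postnet), a stop-gate term scaled by gate_loss_weight, and the decaying
# guided attention penalty configured above.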

modules:
  model: *id002

opt_class: !name:torch.optim.Adam
  lr: 0.001
  weight_decay: 0.000006
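
# `!name:` makes opt_class a partial rather than an instance: the SpeechBrain
# Brain class later constructs the optimizer itself, roughly as
# torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.000006),
# once the modules are on the target device.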

epoch_counter: &id003 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 500

train_logger: !new:speechbrain.utils.train_logger.WandBLogger
  initializer: !name:wandb.init
    name: tacotron2-luganda
    entity: sulaiman-kagumire
    project: tts-luganda
    reinit: true
    resume: allow

lr_annealing: &id004 !new:speechbrain.nnet.schedulers.IntervalScheduler
  intervals:
    - steps: 6000
      lr: 0.0005
    - steps: 8000
      lr: 0.0003
    - steps: 10000
      lr: 0.0001
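
# IntervalScheduler keeps the learning rate at 0.001 until optimizer step
# 6000, then drops it to 0.0005, to 0.0003 at step 8000, and to 0.0001 from
# step 10000 onward.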

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: ./results/tacotron2/1234/save
  recoverables:
    model: *id002
    counter: *id003
    scheduler: *id004

progress_sample_logger: !new:speechbrain.utils.train_logger.ProgressSampleLogger
  output_path: ./results/tacotron2/1234/samples
  batch_sample_size: 3
  formats:
    raw_batch: raw