dangvansam
/

viet-tts

Model card Files Files and versions Community

viet-tts / config.yaml

dangvansam's picture

Upload 6 files

a3e9201 verified about 1 month ago

history blame contribute delete

4.2 kB

	# Seed every random number generator used by the pipeline with the same
	# value (1986). Each `!apply:` tag calls the named function with the listed
	# positional argument when the config is loaded; the `__set_seedN` keys only
	# hold the call results and are not referenced anywhere else in this file.
	__set_seed1: !apply:random.seed [1986]
	__set_seed2: !apply:numpy.random.seed [1986]
	__set_seed3: !apply:torch.manual_seed [1986]
	__set_seed4: !apply:torch.cuda.manual_seed_all [1986]

	# Shared scalar values. The model sections below pull them in with
	# `!ref <name>` so each value is defined in exactly one place.
	# Referenced by hift.sampling_rate.
	sample_rate: 22050
	# Referenced by llm.text_encoder_input_size and llm.text_encoder.input_size.
	text_encoder_input_size: 512
	# Referenced by llm.llm_input_size and the inner llm.llm.input_size.
	llm_input_size: 1024
	# Referenced by llm.llm_output_size and the inner llm.llm.output_size.
	llm_output_size: 1024
	# Referenced by llm.spk_embed_dim and flow.spk_embed_dim.
	spk_embed_dim: 192

	# Language model. `!new:` instantiates viettts.llm.llm.TransformerLM with
	# the keyword arguments nested one level below this key.
	llm: !new:viettts.llm.llm.TransformerLM
	    text_encoder_input_size: !ref <text_encoder_input_size>
	    llm_input_size: !ref <llm_input_size>
	    llm_output_size: !ref <llm_output_size>
	    text_token_size: 60515
	    speech_token_size: 4096
	    length_normalized_loss: True
	    lsm_weight: 0
	    spk_embed_dim: !ref <spk_embed_dim>
	    # Object passed as the `text_encoder` argument. Its keys are arguments
	    # of ConformerEncoder, not of the outer TransformerLM.
	    text_encoder: !new:viettts.transformer.encoder.ConformerEncoder
	        input_size: !ref <text_encoder_input_size>
	        output_size: 1024
	        attention_heads: 16
	        linear_units: 4096
	        num_blocks: 6
	        dropout_rate: 0.1
	        positional_dropout_rate: 0.1
	        attention_dropout_rate: 0.0
	        normalize_before: True
	        input_layer: 'linear'
	        pos_enc_layer_type: 'rel_pos_espnet'
	        selfattention_layer_type: 'rel_selfattn'
	        use_cnn_module: False
	        macaron_style: False
	        use_dynamic_chunk: False
	        use_dynamic_left_chunk: False
	        static_chunk_size: 1
	    # Object passed as the `llm` argument of TransformerLM. It shares its
	    # key name with the top-level `llm` entry, so it must stay nested.
	    llm: !new:viettts.transformer.encoder.TransformerEncoder
	        input_size: !ref <llm_input_size>
	        output_size: !ref <llm_output_size>
	        attention_heads: 16
	        linear_units: 4096
	        num_blocks: 14
	        dropout_rate: 0.1
	        positional_dropout_rate: 0.1
	        attention_dropout_rate: 0.0
	        input_layer: 'linear_legacy'
	        pos_enc_layer_type: 'rel_pos_espnet'
	        selfattention_layer_type: 'rel_selfattn'
	        static_chunk_size: 1
	    # `!name:` passes the function itself with the keyword arguments below
	    # pre-bound, instead of calling it at load time.
	    sampling: !name:viettts.utils.common.ras_sampling
	        top_p: 0.8
	        top_k: 25
	        win_size: 10
	        tau_r: 0.1

	# Flow model. `!new:` instantiates viettts.flow.flow.MaskedDiffWithXvec
	# with the keyword arguments nested one level below this key.
	flow: !new:viettts.flow.flow.MaskedDiffWithXvec
	    input_size: 512
	    output_size: 80
	    spk_embed_dim: !ref <spk_embed_dim>
	    output_type: 'mel'
	    vocab_size: 4096
	    input_frame_rate: 25
	    only_mask_loss: True
	    # Object passed as the `encoder` argument. Its input_size/output_size
	    # belong to ConformerEncoder and are distinct from the flow-level keys
	    # of the same name above.
	    encoder: !new:viettts.transformer.encoder.ConformerEncoder
	        output_size: 512
	        attention_heads: 8
	        linear_units: 2048
	        num_blocks: 6
	        dropout_rate: 0.1
	        positional_dropout_rate: 0.1
	        attention_dropout_rate: 0.1
	        normalize_before: True
	        input_layer: 'linear'
	        pos_enc_layer_type: 'rel_pos_espnet'
	        selfattention_layer_type: 'rel_selfattn'
	        input_size: 512
	        use_cnn_module: False
	        macaron_style: False
	    # Object passed as the `length_regulator` argument.
	    length_regulator: !new:viettts.flow.length_regulator.InterpolateRegulator
	        channels: 80
	        sampling_ratios: [1, 1, 1, 1]
	    # Object passed as the `decoder` argument.
	    decoder: !new:viettts.flow.flow_matching.ConditionalCFM
	        in_channels: 240
	        n_spks: 1
	        spk_emb_dim: 80
	        # Settings wrapped in an omegaconf.DictConfig; the mapping under
	        # `content` is the constructor's `content` argument.
	        cfm_params: !new:omegaconf.DictConfig
	            content:
	                sigma_min: 1e-06
	                solver: 'euler'
	                t_scheduler: 'cosine'
	                training_cfg_rate: 0.2
	                inference_cfg_rate: 0.7
	                reg_loss_type: 'l1'
	        # Object passed as the decoder's `estimator` argument. Its
	        # in_channels (320) is separate from the decoder's in_channels (240).
	        estimator: !new:viettts.flow.decoder.ConditionalDecoder
	            in_channels: 320
	            out_channels: 80
	            channels: [256, 256]
	            dropout: 0.0
	            attention_head_dim: 64
	            n_blocks: 4
	            num_mid_blocks: 12
	            num_heads: 8
	            act_fn: 'gelu'

	# Vocoder. `!new:` instantiates viettts.hifigan.generator.HiFTGenerator
	# with the keyword arguments nested one level below this key.
	hift: !new:viettts.hifigan.generator.HiFTGenerator
	    in_channels: 80
	    base_channels: 512
	    nb_harmonics: 8
	    # Reuses the top-level `sample_rate` value.
	    sampling_rate: !ref <sample_rate>
	    nsf_alpha: 0.1
	    nsf_sigma: 0.003
	    nsf_voiced_threshold: 10
	    upsample_rates: [8, 8]
	    upsample_kernel_sizes: [16, 16]
	    # Plain nested mapping (no tag) passed as the `istft_params` argument.
	    istft_params:
	        n_fft: 16
	        hop_len: 4
	    resblock_kernel_sizes: [3, 7, 11]
	    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
	    source_resblock_kernel_sizes: [7, 11]
	    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
	    lrelu_slope: 0.1
	    audio_limit: 0.99
	    # Object passed as the `f0_predictor` argument. Its in_channels is an
	    # argument of ConvRNNF0Predictor, separate from the generator's own.
	    f0_predictor: !new:viettts.hifigan.f0_predictor.ConvRNNF0Predictor
	        num_class: 1
	        in_channels: 80
	        cond_channels: 512