{
"base_config": "config/base.json",
"model_type": "NaturalSpeech2",
"dataset": ["LibriTTS"],
"preprocess": {
"use_mel": false,
"use_code": true,
"use_spkid": true,
"use_pitch": true,
"use_duration": true,
"use_phone": true,
"use_len": true,
"use_cross_reference": true,
"train_file": "train.json",
"melspec_dir": "mel",
"code_dir": "code",
"pitch_dir": "pitch",
"duration_dir": "duration",
"clip_mode": "start"
},
"model": {
"latent_dim": 128,
"prior_encoder": {
"vocab_size": 100,
"pitch_min": 50,
"pitch_max": 1100,
"pitch_bins_num": 512,
"encoder": {
"encoder_layer": 6,
"encoder_hidden": 512,
"encoder_head": 8,
"conv_filter_size": 2048,
"conv_kernel_size": 9,
"encoder_dropout": 0.2,
"use_cln": true
},
"duration_predictor": {
"input_size": 512,
"filter_size": 512,
"kernel_size": 3,
"conv_layers": 30,
"cross_attn_per_layer": 3,
"attn_head": 8,
"drop_out": 0.5
},
"pitch_predictor": {
"input_size": 512,
"filter_size": 512,
"kernel_size": 5,
"conv_layers": 30,
"cross_attn_per_layer": 3,
"attn_head": 8,
"drop_out": 0.5
}
},
"diffusion": {
"wavenet": {
"input_size": 128,
"hidden_size": 512,
"out_size": 128,
"num_layers": 40,
"cross_attn_per_layer": 3,
"dilation_cycle": 2,
"attn_head": 8,
"drop_out": 0.2
},
"beta_min": 0.05,
"beta_max": 20,
"sigma": 1.0,
"noise_factor": 1.0,
"ode_solver": "euler"
},
"prompt_encoder": {
"encoder_layer": 6,
"encoder_hidden": 512,
"encoder_head": 8,
"conv_filter_size": 2048,
"conv_kernel_size": 9,
"encoder_dropout": 0.2,
"use_cln": false
},
"query_emb": {
"query_token_num": 32,
"hidden_size": 512,
"head_num": 8
}
}
}