File size: 2,563 Bytes
8c92a11
{
"base_config": "config/base.json",
"model_type": "NaturalSpeech2",
"dataset": ["libritts"],
"preprocess": {
"use_mel": false,
"use_code": true,
"use_spkid": true,
"use_pitch": true,
"use_duration": true,
"use_phone": true,
"use_len": true,
"use_cross_reference": true,
"train_file": "train.json",
"melspec_dir": "mel",
"code_dir": "code",
"pitch_dir": "pitch",
"duration_dir": "duration",
"clip_mode": "start"
},
"model": {
"latent_dim": 128,
"prior_encoder": {
"vocab_size": 100,
"pitch_min": 50,
"pitch_max": 1100,
"pitch_bins_num": 512,
"encoder": {
"encoder_layer": 6,
"encoder_hidden": 512,
"encoder_head": 8,
"conv_filter_size": 2048,
"conv_kernel_size": 9,
"encoder_dropout": 0.2,
"use_cln": true
},
"duration_predictor": {
"input_size": 512,
"filter_size": 512,
"kernel_size": 3,
"conv_layers": 30,
"cross_attn_per_layer": 3,
"attn_head": 8,
"drop_out": 0.5
},
"pitch_predictor": {
"input_size": 512,
"filter_size": 512,
"kernel_size": 5,
"conv_layers": 30,
"cross_attn_per_layer": 3,
"attn_head": 8,
"drop_out": 0.5
}
},
"diffusion": {
"wavenet": {
"input_size": 128,
"hidden_size": 512,
"out_size": 128,
"num_layers": 40,
"cross_attn_per_layer": 3,
"dilation_cycle": 2,
"attn_head": 8,
"drop_out": 0.2
},
"beta_min": 0.05,
"beta_max": 20,
"sigma": 1.0,
"noise_factor": 1.0,
"ode_solver": "euler"
},
"prompt_encoder": {
"encoder_layer": 6,
"encoder_hidden": 512,
"encoder_head": 8,
"conv_filter_size": 2048,
"conv_kernel_size": 9,
"encoder_dropout": 0.2,
"use_cln": false
},
"query_emb": {
"query_token_num": 32,
"hidden_size": 512,
"head_num": 8
}
}
}