File size: 2,186 Bytes
8c92a11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 |
{
    // Experiment configuration for a GAN vocoder ("GANVocoder") that uses the
    // "hifigan" generator.
    // This file contains comments and trailing commas, so it must be read with a
    // parser that accepts them (e.g. JSON5 / JSONC), not a strict JSON parser.

    // Configuration file this experiment builds on.
    // NOTE(review): presumably the values below override those in the base file;
    // confirm against the config loader.
    "base_config": "egs/vocoder/gan/exp_config_base.json",
    "model_type": "GANVocoder",
    // Datasets used by this experiment. Each name listed here has a matching
    // entry in "dataset_path" below.
    "dataset": [
        "ljspeech",
        "vctk",
        "libritts",
    ],
    "dataset_path": {
        // TODO: Replace each "[dataset path]" placeholder with the path to that dataset
        "ljspeech": "[dataset path]",
        "vctk": "[dataset path]",
        "libritts": "[dataset path]",
    },
    // TODO: Fill in the output log path. The default value is "Amphion/ckpts/vocoder"
    "log_dir": "ckpts/vocoder",
    "preprocess": {
        // TODO: Fill in the output data path. The default value is "Amphion/data"
        "processed_dir": "data",
        // Acoustic features to extract during preprocessing
        "extract_mel": true,
        "extract_audio": true,
        "extract_pitch": false,
        "extract_uv": false,
        "extract_amplitude_phase": false,
        // NOTE(review): "extract_pitch" is false above, so this extractor is
        // presumably unused with the current settings; confirm.
        "pitch_extractor": "parselmouth",
        // Features used for model training
        "use_mel": true,
        "use_frame_pitch": false,
        "use_uv": false,
        "use_audio": true,
        // NOTE(review): units are not stated in this file; presumably "n_mel" is
        // the number of mel bins and "sample_rate" is in Hz; confirm.
        "n_mel": 100,
        "sample_rate": 24000
    },
    "model": {
        "generator": "hifigan",
        // Of the discriminators listed here, only "mpd" has a settings section in
        // this file.
        // NOTE(review): settings for the others presumably come from
        // "base_config"; confirm.
        "discriminators": [
            "msd",
            "mpd",
            "mssbcqtd",
            "msstftd",
        ],
        // Settings for the generator selected by "generator" above.
        "hifigan": {
            "resblock": "1",
            // The product of these rates is 8 * 4 * 2 * 2 * 2 = 256.
            // NOTE(review): presumably this has to equal the hop size used for mel
            // extraction, which is not set in this file; confirm.
            "upsample_rates": [
                8,
                4,
                2,
                2,
                2
            ],
            // One kernel size per entry in "upsample_rates"; each value here is
            // twice the corresponding rate.
            "upsample_kernel_sizes": [
                16,
                8,
                4,
                4,
                4
            ],
            "upsample_initial_channel": 768,
            "resblock_kernel_sizes": [
                3,
                5,
                7
            ],
            // Three dilation lists, matching the three entries in
            // "resblock_kernel_sizes".
            "resblock_dilation_sizes": [
                [
                    1,
                    3,
                    5
                ],
                [
                    1,
                    3,
                    5
                ],
                [
                    1,
                    3,
                    5
                ]
            ]
        },
        // Settings for the "mpd" discriminator listed in "discriminators".
        "mpd": {
            // NOTE(review): presumably the periods the discriminator reshapes the
            // waveform by; confirm against the model code.
            "mpd_reshapes": [
                2,
                3,
                5,
                7,
                11,
                17,
                23,
                37
            ],
            "use_spectral_norm": false,
            "discriminator_channel_multi": 1
        }
    },
    "train": {
        "batch_size": 32,
        // NOTE(review): presumably hyperparameters for an AdamW optimizer
        // (learning rate and the two beta coefficients); confirm.
        "adamw": {
            "lr": 2.0e-4,
            "adam_b1": 0.8,
            "adam_b2": 0.99
        },
        // NOTE(review): presumably the decay factor of an exponential
        // learning-rate schedule; how often it is applied is not stated here.
        "exponential_lr": {
            "lr_decay": 0.999
        },
        // Names of the loss criteria enabled for training.
        "criterions": [
            "feature",
            "discriminator",
            "generator",
            "mel",
        ]
    },
    "inference": {
        "batch_size": 1,
    }
}