File size: 2,625 Bytes
8c92a11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
{
"base_config": "config/tts.json",
"model_type": "VALLE",
"task_type": "tts",
"dataset": [
"libritts"
],
"preprocess": {
"extract_phone": true,
"phone_extractor": "espeak", // phoneme extractor: espeak, pypinyin, pypinyin_initials_finals or lexicon
"extract_acoustic_token": true,
"acoustic_token_extractor": "Encodec", // acoustic token extractor: encodec, dac(todo)
"acoustic_token_dir": "acoutic_tokens",
"use_text": false,
"use_phone": true,
"use_acoustic_token": true,
"symbols_dict": "symbols.dict",
"min_duration": 0.5, // lower bound on duration: audio with duration < min_duration is filtered out
"max_duration": 14, // upper bound on duration: audio with duration > max_duration is filtered out
"sample_rate": 24000,
"codec_hop_size": 320
},
"model": {
"text_token_num": 512,
"audio_token_num": 1024,
"decoder_dim": 1024, // embedding dimension of the decoder model
"nhead": 16, // number of attention heads in the decoder layers
"num_decoder_layers": 12, // number of decoder layers
"norm_first": true, // pre or post Normalization.
"add_prenet": false, // whether to add a PreNet after the inputs
"prefix_mode": 0, // mode for how to prefix VALL-E NAR Decoder, 0: no prefix, 1: 0 to random, 2: random to random, 4: chunk of pre or post utterance
"share_embedding": true, // share the parameters of the output projection layer with the parameters of the acoustic embedding
"nar_scale_factor": 1, // model scale factor which will be assigned different meanings in different models
"prepend_bos": false, // whether to prepend <BOS> to the acoustic tokens -> AR Decoder inputs
"num_quantizers": 8, // number of audio quantization layers
// "scaling_xformers": false, // Apply Reworked Conformer scaling on Transformers
},
"train": {
"use_dynamic_batchsize": false, // whether to use dynamic batch size
"ddp": false,
"train_stage": 1, // 0: train all modules; for VALL-E, also supports 1: AR Decoder, 2: NAR Decoder(s)
"max_epoch": 20,
"optimizer": "AdamW",
"scheduler": "cosine",
"warmup_steps": 16000, // number of steps that affects how rapidly the learning rate decreases
"total_training_steps": 800000,
"base_lr": 1e-4, // base learning rate
"valid_interval": 1000,
"log_epoch_step": 1000,
"save_checkpoint_stride": [
1,
1
]
}
}
|