FireRedTTS / configs /config_24k.json
hhguo's picture
remove .DS_Store and add icons
c29beed
{
"gpt": {
"gpt_max_audio_tokens": 630,
"gpt_max_text_tokens": 402,
"gpt_max_prompt_tokens": 70,
"gpt_layers": 30,
"gpt_n_model_channels": 1024,
"gpt_n_heads": 16,
"gpt_number_text_tokens": 57341,
"gpt_start_text_token": 57187,
"gpt_stop_text_token": 57184,
"gpt_num_audio_tokens": 16386,
"gpt_start_audio_token": 16384,
"gpt_stop_audio_token": 16385,
"gpt_code_stride_len": 640,
"duration_const": 102400,
"min_conditioning_length": 48000,
"max_conditioning_length": 128000,
"max_wav_length": 320000,
"max_text_length": 200
},
"flow": {
"output_size": 100,
"input_embedding": {
"out_channels": 512,
"codebook_path": "fireredtts/modules/flow/codebook.npy",
"freeze": true
},
"encoder": {
"input_size": 512,
"output_size": 512,
"attention_heads": 8,
"linear_units": 2048,
"num_blocks": 6,
"dropout_rate": 0.01,
"srcattention_start_index": 0,
"srcattention_end_index": 2,
"attention_dropout_rate": 0.01,
"positional_dropout_rate": 0.01,
"key_bias": true,
"normalize_before": true
},
"length_regulator": {
"channels": 512,
"num_blocks": 4
},
"mel_encoder": {
"in_channels": 100,
"out_channels": 512,
"hidden_channels": 384,
"reduction_rate": 4,
"n_layers": 2,
"n_blocks": 5,
"kernel_size": 3
},
"decoder": {
"t_scheduler": "cosine",
"inference_cfg_rate": 0.7,
"estimator": {
"in_channels": 200,
"out_channels": 100,
"channels": [
256,
256
],
"dropout": 0,
"attention_head_dim": 64,
"n_blocks": 4,
"num_mid_blocks": 12,
"num_heads": 8,
"act_fn": "gelu"
}
}
},
"bigvgan": {
"num_mels": 100,
"upsample_initial_channel": 1536,
"upsample_rates": [
5,
3,
2,
2,
2,
2
],
"upsample_kernel_sizes": [
11,
7,
4,
4,
4,
4
],
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"resblock_type": "1",
"snake_logscale": true,
"activation": "snakebeta",
"use_tanh_at_final": false,
"use_bias_at_final": false,
"use_cuda_kernel": false
}
}