{
    "gpt": {
        "gpt_max_audio_tokens": 630,
        "gpt_max_text_tokens": 402,
        "gpt_max_prompt_tokens": 70,
        "gpt_layers": 30,
        "gpt_n_model_channels": 1024,
        "gpt_n_heads": 16,
        "gpt_number_text_tokens": 57341,
        "gpt_start_text_token": 57187,
        "gpt_stop_text_token": 57184,
        "gpt_num_audio_tokens": 16386,
        "gpt_start_audio_token": 16384,
        "gpt_stop_audio_token": 16385,
        "gpt_code_stride_len": 640,
        "duration_const": 102400,
        "min_conditioning_length": 48000,
        "max_conditioning_length": 128000,
        "max_wav_length": 320000,
        "max_text_length": 200
    },
    "flow": {
        "output_size": 100,
        "input_embedding": {
            "out_channels": 512,
            "codebook_path": "fireredtts/modules/flow/codebook.npy",
            "freeze": true
        },
        "encoder": {
            "input_size": 512,
            "output_size": 512,
            "attention_heads": 8,
            "linear_units": 2048,
            "num_blocks": 6,
            "dropout_rate": 0.01,
            "srcattention_start_index": 0,
            "srcattention_end_index": 2,
            "attention_dropout_rate": 0.01,
            "positional_dropout_rate": 0.01,
            "key_bias": true,
            "normalize_before": true
        },
        "length_regulator": {
            "channels": 512,
            "num_blocks": 4
        },
        "mel_encoder": {
            "in_channels": 100,
            "out_channels": 512,
            "hidden_channels": 384,
            "reduction_rate": 4,
            "n_layers": 2,
            "n_blocks": 5,
            "kernel_size": 3
        },
        "decoder": {
            "t_scheduler": "cosine",
            "inference_cfg_rate": 0.7,
            "estimator": {
                "in_channels": 200,
                "out_channels": 100,
                "channels": [
                    256,
                    256
                ],
                "dropout": 0,
                "attention_head_dim": 64,
                "n_blocks": 4,
                "num_mid_blocks": 12,
                "num_heads": 8,
                "act_fn": "gelu"
            }
        }
    },
    "bigvgan": {
        "num_mels": 100,
        "upsample_initial_channel": 1536,
        "upsample_rates": [
            5,
            3,
            2,
            2,
            2,
            2
        ],
        "upsample_kernel_sizes": [
            11,
            7,
            4,
            4,
            4,
            4
        ],
        "resblock_kernel_sizes": [
            3,
            7,
            11
        ],
        "resblock_dilation_sizes": [
            [
                1,
                3,
                5
            ],
            [
                1,
                3,
                5
            ],
            [
                1,
                3,
                5
            ]
        ],
        "resblock_type": "1",
        "snake_logscale": true,
        "activation": "snakebeta",
        "use_tanh_at_final": false,
        "use_bias_at_final": false,
        "use_cuda_kernel": false
    }
}