File size: 4,447 Bytes
e20eb58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Seed the random number generators with one fixed value so that results can be reproduced.
# Each entry uses the `!apply:` tag with the positional argument list [1986], covering
# Python's `random`, NumPy, PyTorch, and PyTorch CUDA in turn.
# The `__set_seedN` keys are not referenced anywhere else in this section of the file.
# NOTE(review): presumably these calls execute when the file is loaded — confirm against the HyperPyYAML loader.
__set_seed1: !apply:random.seed [1986]
__set_seed2: !apply:numpy.random.seed [1986]
__set_seed3: !apply:torch.manual_seed [1986]
__set_seed4: !apply:torch.cuda.manual_seed_all [1986]

# fixed params
# Scalar values shared by the component definitions below through `!ref <name>`,
# so that each value is written once.
# referenced by hift.sampling_rate
sample_rate: 22050
# referenced by llm.text_encoder_input_size and llm.text_encoder.input_size
text_encoder_input_size: 512
# referenced by llm.llm_input_size and llm.llm.input_size
llm_input_size: 1024
# referenced by llm.llm_output_size and llm.llm.output_size
llm_output_size: 1024
# referenced by llm.spk_embed_dim and flow.spk_embed_dim
spk_embed_dim: 192

# model params
# for every class/function included in this repo, we use !<name> or !<new> for initialization, so that users can find all corresponding classes/functions from this single yaml.
# for system/third-party classes/functions, we do not require this.
# Language-model component: instantiates cosyvoice.llm.llm.TransformerLM with the keyword
# arguments below. Two nested encoders are built and passed in as `text_encoder` and `llm`.
llm: !new:cosyvoice.llm.llm.TransformerLM
    # Sizes taken from the fixed params at the top of the file.
    text_encoder_input_size: !ref <text_encoder_input_size>
    llm_input_size: !ref <llm_input_size>
    llm_output_size: !ref <llm_output_size>
    text_token_size: 51866
    # NOTE(review): flow.vocab_size further down is 16384 while this is 4096 —
    # confirm the two are intended to differ.
    speech_token_size: 4096
    length_normalized_loss: True
    # NOTE(review): presumably the label-smoothing weight (0 = disabled) — confirm in TransformerLM.
    lsm_weight: 0
    spk_embed_dim: !ref <spk_embed_dim>
    # Text encoder: a ConformerEncoder instance.
    text_encoder: !new:cosyvoice.transformer.encoder.ConformerEncoder
        input_size: !ref <text_encoder_input_size>
        # NOTE(review): hard-coded 1024 equals <llm_input_size>; presumably the two must
        # stay in sync — confirm, and consider using !ref.
        output_size: 1024
        attention_heads: 8
        linear_units: 2048
        num_blocks: 3
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0
        normalize_before: True
        input_layer: 'linear'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        # Convolution module and macaron-style options are switched off here.
        use_cnn_module: False
        macaron_style: False
        # Dynamic chunking is switched off; a static chunk size of 1 is configured instead.
        use_dynamic_chunk: False
        use_dynamic_left_chunk: False
        static_chunk_size: 1
    # Backbone: a TransformerEncoder instance whose input/output sizes come from the fixed params.
    llm: !new:cosyvoice.transformer.encoder.TransformerEncoder
        input_size: !ref <llm_input_size>
        output_size: !ref <llm_output_size>
        attention_heads: 8
        linear_units: 2048
        num_blocks: 7
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0
        # Note: uses 'linear_legacy' here, whereas the text encoder above uses 'linear'.
        input_layer: 'linear_legacy'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        static_chunk_size: 1

# Flow component: instantiates cosyvoice.flow.flow.MaskedDiffWithXvec with a nested encoder,
# length regulator, and conditional flow-matching decoder (which itself wraps an estimator).
flow: !new:cosyvoice.flow.flow.MaskedDiffWithXvec
    input_size: 512
    # 80 is also used for length_regulator.channels, estimator.out_channels and hift.in_channels.
    output_size: 80
    spk_embed_dim: !ref <spk_embed_dim>
    output_type: 'mel'
    # NOTE(review): llm.speech_token_size above is 4096 while this is 16384 —
    # confirm the two are intended to differ.
    vocab_size: 16384
    # NOTE(review): presumably input tokens per second — confirm units in MaskedDiffWithXvec.
    input_frame_rate: 12.5
    only_mask_loss: True
    # Encoder: a BlockConformerEncoder instance; its input_size/output_size (512) equal
    # flow.input_size above.
    encoder: !new:cosyvoice.transformer.encoder.BlockConformerEncoder
        output_size: 512
        attention_heads: 8
        linear_units: 2048
        num_blocks: 6
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        normalize_before: True
        input_layer: 'linear'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'block_rel_selfattn'
        block_size: 10
        input_size: 512
        use_cnn_module: False
        macaron_style: False
    # Length regulator: an InterpolateRegulator instance with 80 channels.
    length_regulator: !new:cosyvoice.flow.length_regulator.InterpolateRegulator
        channels: 80
        sampling_ratios: [1, 1, 1, 1]
    # Decoder: a ConditionalCFM instance configured through `cfm_params` and an `estimator` network.
    decoder: !new:cosyvoice.flow.flow_matching.ConditionalCFM
        # NOTE(review): 240 = 3 * 80 and the estimator's 320 = 4 * 80; presumably these are
        # concatenations of 80-channel tensors — confirm against ConditionalCFM / ConditionalDecoder.
        in_channels: 240
        n_spks: 1
        spk_emb_dim: 80
        # Solver / training settings wrapped in an omegaconf.DictConfig built from `content`.
        cfm_params: !new:omegaconf.DictConfig
            content:
                sigma_min: 1e-06
                solver: 'euler'
                t_scheduler: 'cosine'
                training_cfg_rate: 0.2
                inference_cfg_rate: 0.7
                reg_loss_type: 'l1'
        # Estimator network: a ConditionalDecoder instance producing 80 output channels.
        estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
            in_channels: 320
            out_channels: 80
            channels: [256, 256]
            dropout: 0
            attention_head_dim: 64
            n_blocks: 4
            num_mid_blocks: 12
            num_heads: 8
            act_fn: 'gelu'

# Vocoder component: instantiates cosyvoice.hifigan.generator.HiFTGenerator with a nested
# ConvRNNF0Predictor passed in as `f0_predictor`.
hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
    # 80 input channels, the same value as flow.output_size.
    in_channels: 80
    base_channels: 512
    nb_harmonics: 8
    # Taken from the top-level `sample_rate` (22050).
    sampling_rate: !ref <sample_rate>
    nsf_alpha: 0.1
    nsf_sigma: 0.003
    nsf_voiced_threshold: 10
    # NOTE(review): 8 * 8 * hop_len (4) = 256; presumably the number of output samples per
    # input frame — confirm against HiFTGenerator.
    upsample_rates: [8, 8]
    upsample_kernel_sizes: [16, 16]
    # Plain nested mapping (no tag), so it is passed as a dict-like value.
    istft_params:
        n_fft: 16
        hop_len: 4
    # One dilation list per kernel size: three of each for the main resblocks,
    # two of each for the source resblocks.
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    source_resblock_kernel_sizes: [7, 11]
    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
    lrelu_slope: 0.1
    audio_limit: 0.99
    # F0 predictor: a ConvRNNF0Predictor instance taking the same 80 input channels.
    f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
        num_class: 1
        in_channels: 80
        cond_channels: 512