dangvansam committed on
Commit
391d5a4
·
verified ·
1 Parent(s): 9f7647f

Delete pretrained-models

Browse files
pretrained-models/config.yaml DELETED
@@ -1,129 +0,0 @@
1
- __set_seed1: !apply:random.seed [1986]
2
- __set_seed2: !apply:numpy.random.seed [1986]
3
- __set_seed3: !apply:torch.manual_seed [1986]
4
- __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
5
-
6
- sample_rate: 22050
7
- text_encoder_input_size: 512
8
- llm_input_size: 1024
9
- llm_output_size: 1024
10
- spk_embed_dim: 192
11
-
12
- llm: !new:src.llm.llm.TransformerLM
13
- text_encoder_input_size: !ref <text_encoder_input_size>
14
- llm_input_size: !ref <llm_input_size>
15
- llm_output_size: !ref <llm_output_size>
16
- text_token_size: 60515
17
- speech_token_size: 4096
18
- length_normalized_loss: True
19
- lsm_weight: 0
20
- spk_embed_dim: !ref <spk_embed_dim>
21
- text_encoder: !new:src.transformer.encoder.ConformerEncoder
22
- input_size: !ref <text_encoder_input_size>
23
- output_size: 1024
24
- attention_heads: 16
25
- linear_units: 4096
26
- num_blocks: 6
27
- dropout_rate: 0.1
28
- positional_dropout_rate: 0.1
29
- attention_dropout_rate: 0.0
30
- normalize_before: True
31
- input_layer: 'linear'
32
- pos_enc_layer_type: 'rel_pos_espnet'
33
- selfattention_layer_type: 'rel_selfattn'
34
- use_cnn_module: False
35
- macaron_style: False
36
- use_dynamic_chunk: False
37
- use_dynamic_left_chunk: False
38
- static_chunk_size: 1
39
- llm: !new:src.transformer.encoder.TransformerEncoder
40
- input_size: !ref <llm_input_size>
41
- output_size: !ref <llm_output_size>
42
- attention_heads: 16
43
- linear_units: 4096
44
- num_blocks: 14
45
- dropout_rate: 0.1
46
- positional_dropout_rate: 0.1
47
- attention_dropout_rate: 0.0
48
- input_layer: 'linear_legacy'
49
- pos_enc_layer_type: 'rel_pos_espnet'
50
- selfattention_layer_type: 'rel_selfattn'
51
- static_chunk_size: 1
52
- sampling: !name:src.utils.common.ras_sampling
53
- top_p: 0.8
54
- top_k: 25
55
- win_size: 10
56
- tau_r: 0.1
57
-
58
- flow: !new:src.flow.flow.MaskedDiffWithXvec
59
- input_size: 512
60
- output_size: 80
61
- spk_embed_dim: !ref <spk_embed_dim>
62
- output_type: 'mel'
63
- vocab_size: 4096
64
- input_frame_rate: 25
65
- only_mask_loss: True
66
- encoder: !new:src.transformer.encoder.ConformerEncoder
67
- output_size: 512
68
- attention_heads: 8
69
- linear_units: 2048
70
- num_blocks: 6
71
- dropout_rate: 0.1
72
- positional_dropout_rate: 0.1
73
- attention_dropout_rate: 0.1
74
- normalize_before: True
75
- input_layer: 'linear'
76
- pos_enc_layer_type: 'rel_pos_espnet'
77
- selfattention_layer_type: 'rel_selfattn'
78
- input_size: 512
79
- use_cnn_module: False
80
- macaron_style: False
81
- length_regulator: !new:src.flow.length_regulator.InterpolateRegulator
82
- channels: 80
83
- sampling_ratios: [1, 1, 1, 1]
84
- decoder: !new:src.flow.flow_matching.ConditionalCFM
85
- in_channels: 240
86
- n_spks: 1
87
- spk_emb_dim: 80
88
- cfm_params: !new:omegaconf.DictConfig
89
- content:
90
- sigma_min: 1e-06
91
- solver: 'euler'
92
- t_scheduler: 'cosine'
93
- training_cfg_rate: 0.2
94
- inference_cfg_rate: 0.7
95
- reg_loss_type: 'l1'
96
- estimator: !new:src.flow.decoder.ConditionalDecoder
97
- in_channels: 320
98
- out_channels: 80
99
- channels: [256, 256]
100
- dropout: 0.0
101
- attention_head_dim: 64
102
- n_blocks: 4
103
- num_mid_blocks: 12
104
- num_heads: 8
105
- act_fn: 'gelu'
106
-
107
- hift:
108
- in_channels: 80
109
- base_channels: 512
110
- nb_harmonics: 8
111
- sampling_rate: !ref <sample_rate>
112
- nsf_alpha: 0.1
113
- nsf_sigma: 0.003
114
- nsf_voiced_threshold: 10
115
- upsample_rates: [8, 8]
116
- upsample_kernel_sizes: [16, 16]
117
- istft_params:
118
- n_fft: 16
119
- hop_len: 4
120
- resblock_kernel_sizes: [3, 7, 11]
121
- resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
122
- source_resblock_kernel_sizes: [7, 11]
123
- source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
124
- lrelu_slope: 0.1
125
- audio_limit: 0.99
126
- f0_predictor: !new:src.hifigan.f0_predictor.ConvRNNF0Predictor
127
- num_class: 1
128
- in_channels: 80
129
- cond_channels: 512
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pretrained-models/flow.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1411de192039a21d53f0bf1968feb50586ce71d81ea1443f8163f4d1c46c5455
3
- size 419901370
 
 
 
 
pretrained-models/hift.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:91e679b6ca1eff71187ffb4f3ab0444935594cdcc20a9bd12afad111ef8d6012
3
- size 81896716
 
 
 
 
pretrained-models/llm.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1773e5afe16a88ee82e33cf510a07717ce1346d2e74856733d72dc297a9a017
3
- size 1260740644
 
 
 
 
pretrained-models/speech_embedding.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
- size 28303423
 
 
 
 
pretrained-models/speech_tokenizer.onnx DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:56285ddd4a83e883ee0cb9f8d69c1089b53a94b1f78ff7e4a0224a27eb4cb486
3
- size 522625011