dangvansam committed on
Commit
a3e9201
1 Parent(s): 391d5a4

Upload 6 files

Browse files
Files changed (6) hide show
  1. config.yaml +129 -0
  2. flow.pt +3 -0
  3. hift.pt +3 -0
  4. llm.pt +3 -0
  5. speech_embedding.onnx +3 -0
  6. speech_tokenizer.onnx +3 -0
config.yaml ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __set_seed1: !apply:random.seed [1986]
2
+ __set_seed2: !apply:numpy.random.seed [1986]
3
+ __set_seed3: !apply:torch.manual_seed [1986]
4
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
5
+
6
+ sample_rate: 22050
7
+ text_encoder_input_size: 512
8
+ llm_input_size: 1024
9
+ llm_output_size: 1024
10
+ spk_embed_dim: 192
11
+
12
+ llm: !new:viettts.llm.llm.TransformerLM
13
+ text_encoder_input_size: !ref <text_encoder_input_size>
14
+ llm_input_size: !ref <llm_input_size>
15
+ llm_output_size: !ref <llm_output_size>
16
+ text_token_size: 60515
17
+ speech_token_size: 4096
18
+ length_normalized_loss: True
19
+ lsm_weight: 0
20
+ spk_embed_dim: !ref <spk_embed_dim>
21
+ text_encoder: !new:viettts.transformer.encoder.ConformerEncoder
22
+ input_size: !ref <text_encoder_input_size>
23
+ output_size: 1024
24
+ attention_heads: 16
25
+ linear_units: 4096
26
+ num_blocks: 6
27
+ dropout_rate: 0.1
28
+ positional_dropout_rate: 0.1
29
+ attention_dropout_rate: 0.0
30
+ normalize_before: True
31
+ input_layer: 'linear'
32
+ pos_enc_layer_type: 'rel_pos_espnet'
33
+ selfattention_layer_type: 'rel_selfattn'
34
+ use_cnn_module: False
35
+ macaron_style: False
36
+ use_dynamic_chunk: False
37
+ use_dynamic_left_chunk: False
38
+ static_chunk_size: 1
39
+ llm: !new:viettts.transformer.encoder.TransformerEncoder
40
+ input_size: !ref <llm_input_size>
41
+ output_size: !ref <llm_output_size>
42
+ attention_heads: 16
43
+ linear_units: 4096
44
+ num_blocks: 14
45
+ dropout_rate: 0.1
46
+ positional_dropout_rate: 0.1
47
+ attention_dropout_rate: 0.0
48
+ input_layer: 'linear_legacy'
49
+ pos_enc_layer_type: 'rel_pos_espnet'
50
+ selfattention_layer_type: 'rel_selfattn'
51
+ static_chunk_size: 1
52
+ sampling: !name:viettts.utils.common.ras_sampling
53
+ top_p: 0.8
54
+ top_k: 25
55
+ win_size: 10
56
+ tau_r: 0.1
57
+
58
+ flow: !new:viettts.flow.flow.MaskedDiffWithXvec
59
+ input_size: 512
60
+ output_size: 80
61
+ spk_embed_dim: !ref <spk_embed_dim>
62
+ output_type: 'mel'
63
+ vocab_size: 4096
64
+ input_frame_rate: 25
65
+ only_mask_loss: True
66
+ encoder: !new:viettts.transformer.encoder.ConformerEncoder
67
+ output_size: 512
68
+ attention_heads: 8
69
+ linear_units: 2048
70
+ num_blocks: 6
71
+ dropout_rate: 0.1
72
+ positional_dropout_rate: 0.1
73
+ attention_dropout_rate: 0.1
74
+ normalize_before: True
75
+ input_layer: 'linear'
76
+ pos_enc_layer_type: 'rel_pos_espnet'
77
+ selfattention_layer_type: 'rel_selfattn'
78
+ input_size: 512
79
+ use_cnn_module: False
80
+ macaron_style: False
81
+ length_regulator: !new:viettts.flow.length_regulator.InterpolateRegulator
82
+ channels: 80
83
+ sampling_ratios: [1, 1, 1, 1]
84
+ decoder: !new:viettts.flow.flow_matching.ConditionalCFM
85
+ in_channels: 240
86
+ n_spks: 1
87
+ spk_emb_dim: 80
88
+ cfm_params: !new:omegaconf.DictConfig
89
+ content:
90
+ sigma_min: 1e-06
91
+ solver: 'euler'
92
+ t_scheduler: 'cosine'
93
+ training_cfg_rate: 0.2
94
+ inference_cfg_rate: 0.7
95
+ reg_loss_type: 'l1'
96
+ estimator: !new:viettts.flow.decoder.ConditionalDecoder
97
+ in_channels: 320
98
+ out_channels: 80
99
+ channels: [256, 256]
100
+ dropout: 0.0
101
+ attention_head_dim: 64
102
+ n_blocks: 4
103
+ num_mid_blocks: 12
104
+ num_heads: 8
105
+ act_fn: 'gelu'
106
+
107
+ hift: !new:viettts.hifigan.generator.HiFTGenerator
108
+ in_channels: 80
109
+ base_channels: 512
110
+ nb_harmonics: 8
111
+ sampling_rate: !ref <sample_rate>
112
+ nsf_alpha: 0.1
113
+ nsf_sigma: 0.003
114
+ nsf_voiced_threshold: 10
115
+ upsample_rates: [8, 8]
116
+ upsample_kernel_sizes: [16, 16]
117
+ istft_params:
118
+ n_fft: 16
119
+ hop_len: 4
120
+ resblock_kernel_sizes: [3, 7, 11]
121
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
122
+ source_resblock_kernel_sizes: [7, 11]
123
+ source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
124
+ lrelu_slope: 0.1
125
+ audio_limit: 0.99
126
+ f0_predictor: !new:viettts.hifigan.f0_predictor.ConvRNNF0Predictor
127
+ num_class: 1
128
+ in_channels: 80
129
+ cond_channels: 512
flow.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1411de192039a21d53f0bf1968feb50586ce71d81ea1443f8163f4d1c46c5455
3
+ size 419901370
hift.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91e679b6ca1eff71187ffb4f3ab0444935594cdcc20a9bd12afad111ef8d6012
3
+ size 81896716
llm.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1773e5afe16a88ee82e33cf510a07717ce1346d2e74856733d72dc297a9a017
3
+ size 1260740644
speech_embedding.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73
3
+ size 28303423
speech_tokenizer.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56285ddd4a83e883ee0cb9f8d69c1089b53a94b1f78ff7e4a0224a27eb4cb486
3
+ size 522625011