hf-transformers-bot committed on
Commit
97023e7
1 Parent(s): 1dabe08

Upload tiny models for SeamlessM4Tv2ForTextToSpeech

Browse files
config.json ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "relu",
4
+ "adaptor_dropout": 0.1,
5
+ "adaptor_kernel_size": 8,
6
+ "adaptor_stride": 8,
7
+ "add_adapter": true,
8
+ "architectures": [
9
+ "SeamlessM4Tv2ForTextToSpeech"
10
+ ],
11
+ "attention_dropout": 0.1,
12
+ "attention_probs_dropout_prob": 0.1,
13
+ "bos_token_id": 2,
14
+ "char_vocab_size": 4,
15
+ "conv_depthwise_kernel_size": 31,
16
+ "decoder_attention_heads": 2,
17
+ "decoder_ffn_dim": 6,
18
+ "decoder_layerdrop": 0.05,
19
+ "decoder_layers": 2,
20
+ "decoder_start_token_id": 3,
21
+ "dropout": 0.1,
22
+ "encoder_attention_heads": 2,
23
+ "encoder_ffn_dim": 6,
24
+ "encoder_layerdrop": 0.05,
25
+ "encoder_layers": 2,
26
+ "eos_token_id": 3,
27
+ "feature_projection_input_dim": 160,
28
+ "hidden_act": "gelu",
29
+ "hidden_dropout_prob": 0.1,
30
+ "hidden_size": 6,
31
+ "initializer_range": 0.02,
32
+ "is_encoder_decoder": true,
33
+ "lang_embed_dim": 6,
34
+ "layer_norm_eps": 1e-05,
35
+ "leaky_relu_slope": 0.1,
36
+ "left_max_position_embeddings": 2,
37
+ "max_new_tokens": null,
38
+ "max_position_embeddings": 256,
39
+ "model_type": "seamless_m4t_v2",
40
+ "num_adapter_layers": 1,
41
+ "num_attention_heads": 2,
42
+ "num_conv_pos_embeddings": 8,
43
+ "num_hidden_layers": 2,
44
+ "pad_token_id": 0,
45
+ "position_embeddings_type": "relative_key",
46
+ "resblock_dilation_sizes": [
47
+ [
48
+ 1,
49
+ 3,
50
+ 5
51
+ ],
52
+ [
53
+ 1,
54
+ 3,
55
+ 5
56
+ ],
57
+ [
58
+ 1,
59
+ 3,
60
+ 5
61
+ ]
62
+ ],
63
+ "resblock_kernel_sizes": [
64
+ 3,
65
+ 7,
66
+ 11
67
+ ],
68
+ "right_max_position_embeddings": 1,
69
+ "sampling_rate": 16000,
70
+ "scale_embedding": true,
71
+ "speech_encoder_attention_heads": 2,
72
+ "speech_encoder_chunk_size": 2,
73
+ "speech_encoder_dropout": 0.0,
74
+ "speech_encoder_hidden_act": "swish",
75
+ "speech_encoder_intermediate_size": 6,
76
+ "speech_encoder_layerdrop": 0.1,
77
+ "speech_encoder_layers": 2,
78
+ "speech_encoder_left_chunk_num": 1,
79
+ "spkr_embed_dim": 6,
80
+ "t2u_bos_token_id": 0,
81
+ "t2u_decoder_attention_heads": 2,
82
+ "t2u_decoder_ffn_dim": 6,
83
+ "t2u_decoder_layers": 2,
84
+ "t2u_encoder_attention_heads": 2,
85
+ "t2u_encoder_ffn_dim": 6,
86
+ "t2u_encoder_layers": 2,
87
+ "t2u_eos_token_id": 2,
88
+ "t2u_max_position_embeddings": 4096,
89
+ "t2u_num_langs": 0,
90
+ "t2u_offset_tgt_lang": 0,
91
+ "t2u_pad_token_id": 1,
92
+ "t2u_variance_pred_dropout": 0.5,
93
+ "t2u_variance_predictor_embed_dim": 6,
94
+ "t2u_variance_predictor_hidden_dim": 4,
95
+ "t2u_variance_predictor_kernel_size": 3,
96
+ "t2u_vocab_size": 20,
97
+ "torch_dtype": "float32",
98
+ "transformers_version": "4.36.0.dev0",
99
+ "unit_embed_dim": 25,
100
+ "unit_hifi_gan_vocab_size": 20,
101
+ "unit_hifigan_vocab_vise": 20,
102
+ "upsample_initial_channel": 32,
103
+ "upsample_kernel_sizes": [
104
+ 11,
105
+ 8,
106
+ 8,
107
+ 4,
108
+ 4
109
+ ],
110
+ "upsample_rates": [
111
+ 5,
112
+ 4,
113
+ 4,
114
+ 2,
115
+ 2
116
+ ],
117
+ "use_cache": true,
118
+ "var_pred_dropout": 0.5,
119
+ "variance_predictor_kernel_size": 3,
120
+ "vocab_size": 20,
121
+ "vocoder_num_langs": 5,
122
+ "vocoder_num_spkrs": 5,
123
+ "vocoder_offset": 0
124
+ }
generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "decoder_start_token_id": 3,
5
+ "eos_token_id": 3,
6
+ "pad_token_id": 0,
7
+ "transformers_version": "4.36.0.dev0"
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:044af3f5f7ef7075bdbb5ead30d3cfff5bb8b147b17a235dfc6533d1eb75e01e
3
+ size 309184
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_extractor_type": "SeamlessM4TFeatureExtractor",
3
+ "feature_size": 80,
4
+ "num_mel_bins": 80,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000,
9
+ "stride": 2
10
+ }