nisten committed on
Commit deb6397
1 parent: a969a28

Add files using upload-large-folder tool

LICENSE ADDED
@@ -0,0 +1,9 @@
+ Copyright 2024 APOLLO-DEEZE-NUTZ
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
config.json ADDED
@@ -0,0 +1,293 @@
+ {
+ "architectures": [
+ "ApolloForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "attn_implementation": "flash_attention_2",
+ "clip_duration": 2,
+ "drop_path_rate": 0.0,
+ "encode_batch_size": 15,
+ "hidden_act": "silu",
+ "hidden_size": 3584,
+ "image_aspect_ratio": "square",
+ "initializer_range": 0.02,
+ "intermediate_size": 18944,
+ "interpolate_mode": "linear",
+ "llm_cfg": {
+ "add_cross_attention": false,
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": 151643,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "early_stopping": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": 151645,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "hidden_act": "silu",
+ "hidden_size": 3584,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "initializer_range": 0.02,
+ "intermediate_size": 18944,
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "max_position_embeddings": 32768,
+ "max_window_layers": 28,
+ "min_length": 0,
+ "model_max_length": 16384,
+ "model_type": "qwen2",
+ "no_repeat_ngram_size": 0,
+ "num_attention_heads": 28,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 4,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "rms_norm_eps": 1e-06,
+ "rope_theta": 1000000.0,
+ "sep_token_id": null,
+ "sliding_window": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": false,
+ "tokenizer_class": null,
+ "tokenizer_model_max_length": 16384,
+ "tokenizer_padding_side": "right",
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": "bfloat16",
+ "torchscript": false,
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 152064
+ },
+ "max_position_embeddings": 32768,
+ "max_window_layers": 28,
+ "mm_connector_cfg": {
+ "add_cross_attention": false,
+ "architectures": [
+ "Connector"
+ ],
+ "attention_dropout": 0.0,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "early_stopping": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "ff_multi": 4,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "hidden_act": "silu",
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "min_length": 0,
+ "model_type": "mm_connector",
+ "no_repeat_ngram_size": 0,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_key_value_heads": 4,
+ "num_output_tokens": 128,
+ "num_patches": 24,
+ "num_return_sequences": 1,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "prefix": null,
+ "problem_type": null,
+ "projector_type": "mlp1x_gelu",
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "resampler_depth": 1,
+ "resampler_head_dim": 96,
+ "resampler_n_heads": 16,
+ "resampler_type": "perciver",
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "rms_norm_eps": 1e-06,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "text_hidden_size": 3584,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "token_input_shape": [
+ 4,
+ 27,
+ 27
+ ],
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": "bfloat16",
+ "torchscript": false,
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "vision_hidden_size": 2560
+ },
+ "mm_connector_lr": 0.0001,
+ "mm_hidden_size": null,
+ "mm_vision_select_feature": "patch",
+ "mm_vision_select_layer": -2,
+ "model_dtype": "torch.bfloat16",
+ "model_type": "apollo",
+ "num_attention_heads": 28,
+ "num_encode_batch": 0,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 4,
+ "num_video_frames": null,
+ "resume_path": "./work_dirs/final_run/apollo-Qwen2.5-7B-Instruct-internvideo2-siglip-so400m-patch14-384-freeze-perciver_128_2-newprompt-ft",
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000.0,
+ "s2": false,
+ "s2_max_split_size": 336,
+ "s2_scales": "336,672,1008",
+ "sliding_window": null,
+ "temporal_prompt": true,
+ "timestamp_prompt": true,
+ "transformers_version": "4.44.0",
+ "tune_language_model": true,
+ "tune_mm_connector": true,
+ "tune_vision_tower": false,
+ "use_cache": true,
+ "use_mm_patch_token": false,
+ "use_mm_start_end": false,
+ "use_sliding_window": false,
+ "vision_resolution": -1,
+ "vision_tower_cfg": {
+ "add_cross_attention": false,
+ "architectures": null,
+ "bad_words_ids": null,
+ "begin_suppress_tokens": null,
+ "bos_token_id": null,
+ "chunk_size_feed_forward": 0,
+ "configs": {},
+ "cross_attention_hidden_size": null,
+ "decoder_start_token_id": null,
+ "diversity_penalty": 0.0,
+ "do_sample": false,
+ "early_stopping": false,
+ "encoder_no_repeat_ngram_size": 0,
+ "eos_token_id": null,
+ "exponential_decay_length_penalty": null,
+ "finetuning_task": null,
+ "forced_bos_token_id": null,
+ "forced_eos_token_id": null,
+ "id2label": {
+ "0": "LABEL_0",
+ "1": "LABEL_1"
+ },
+ "is_decoder": false,
+ "is_encoder_decoder": false,
+ "label2id": {
+ "LABEL_0": 0,
+ "LABEL_1": 1
+ },
+ "length_penalty": 1.0,
+ "max_length": 20,
+ "min_length": 0,
+ "model_type": "hybrid_vision_tower",
+ "no_repeat_ngram_size": 0,
+ "num_beam_groups": 1,
+ "num_beams": 1,
+ "num_return_sequences": 1,
+ "num_vision_encoders": 2,
+ "output_attentions": false,
+ "output_hidden_states": false,
+ "output_scores": false,
+ "pad_token_id": null,
+ "prefix": null,
+ "problem_type": null,
+ "pruned_heads": {},
+ "remove_invalid_values": false,
+ "repetition_penalty": 1.0,
+ "return_dict": true,
+ "return_dict_in_generate": false,
+ "sep_token_id": null,
+ "suppress_tokens": null,
+ "task_specific_params": null,
+ "temperature": 1.0,
+ "tf_legacy_loss": false,
+ "tie_encoder_decoder": false,
+ "tie_word_embeddings": true,
+ "token_output_shape": [
+ 4,
+ 27,
+ 27
+ ],
+ "tokenizer_class": null,
+ "top_k": 50,
+ "top_p": 1.0,
+ "torch_dtype": null,
+ "torchscript": false,
+ "typical_p": 1.0,
+ "use_bfloat16": false,
+ "vision_towers": [
+ "siglip-so400m-patch14-384",
+ "internvideo2"
+ ]
+ },
+ "vocab_size": 152064,
+ "auto_map": {
+ "AutoConfig": "configuration_apollo.ApolloConfig",
+ "AutoModelForCausalLM": "modeling_apollo.ApolloForCausalLM"
+ },
+ "model_max_length": 16384
+ }
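
Usage note (not part of the committed files): because config.json carries an "auto_map" pointing at configuration_apollo.ApolloConfig and modeling_apollo.ApolloForCausalLM, the checkpoint is meant to be loaded with trust_remote_code=True. A minimal sketch, assuming the repository (including modeling_apollo.py) has been cloned to a placeholder local path:

    import torch
    from transformers import AutoConfig, AutoModelForCausalLM

    repo_path = "./apollo-checkout"  # placeholder: wherever this repo is cloned
    config = AutoConfig.from_pretrained(repo_path, trust_remote_code=True)  # resolved via "auto_map"
    model = AutoModelForCausalLM.from_pretrained(
        repo_path,
        config=config,
        trust_remote_code=True,      # required so configuration_apollo.py / modeling_apollo.py are used
        torch_dtype=torch.bfloat16,  # matches "model_dtype" / "torch_dtype" above
    )
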
configuration_apollo.py ADDED
@@ -0,0 +1,47 @@
+ #from transformers import PretrainedConfig
+ from transformers import PretrainedConfig
+
+
+ class ApolloConfig(PretrainedConfig):
+     model_type = "apollo"
+     def __init__(
+         self,
+         llm_cfg=None,
+         vision_tower_cfg=None,
+         mm_connector_cfg=None,
+         architectures=None,
+         resume_path=None,
+         image_aspect_ratio=None,
+         num_video_frames=None,
+         mm_vision_select_layer=None,
+         mm_vision_select_feature=None,
+         use_mm_start_end=False,
+         use_mm_patch_token=True,
+         mm_connector_lr=None,
+         vision_resolution=None,
+         interpolate_mode=None,
+         clip_duration=None,
+         vocab_size=None,
+         auto_map=None,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+
+         self.architectures = architectures
+         self.llm_cfg = llm_cfg
+         self.vision_tower_cfg = vision_tower_cfg
+         self.mm_connector_cfg = mm_connector_cfg
+         self.resume_path = resume_path
+         self.image_aspect_ratio = image_aspect_ratio
+         self.num_video_frames = num_video_frames
+         self.mm_vision_select_layer = mm_vision_select_layer
+         self.mm_vision_select_feature = mm_vision_select_feature
+         self.use_mm_start_end = use_mm_start_end
+         self.use_mm_patch_token = use_mm_patch_token
+         self.mm_connector_lr = mm_connector_lr
+         self.vision_resolution = vision_resolution
+         self.interpolate_mode = interpolate_mode
+         self.clip_duration = clip_duration
+         self.vocab_size = vocab_size
+         self.auto_map = auto_map
+
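
A small sketch (not part of the commit) of how this config class behaves: named arguments become attributes directly, while anything not in the explicit signature (e.g. hidden_size) is typically absorbed by PretrainedConfig via **kwargs.

    from configuration_apollo import ApolloConfig

    cfg = ApolloConfig(
        architectures=["ApolloForCausalLM"],
        image_aspect_ratio="square",
        clip_duration=2,
        vocab_size=152064,
        hidden_size=3584,   # not in the explicit signature; passed through **kwargs
    )
    print(cfg.model_type)   # "apollo"
    print(cfg.hidden_size)  # 3584
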
llm/added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "</tool_call>": 151658,
+ "<tool_call>": 151657,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
llm/config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "_name_or_path": "./work_dirs/final_run/apollo-Qwen2.5-7B-Instruct-internvideo2-siglip-so400m-patch14-384-freeze-perciver_128_2-newprompt-ft/llm",
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "hidden_act": "silu",
+ "hidden_size": 3584,
+ "initializer_range": 0.02,
+ "intermediate_size": 18944,
+ "max_position_embeddings": 32768,
+ "max_window_layers": 28,
+ "model_max_length": 16384,
+ "model_type": "qwen2",
+ "num_attention_heads": 28,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 4,
+ "rms_norm_eps": 1e-06,
+ "rope_theta": 1000000.0,
+ "sliding_window": null,
+ "tie_word_embeddings": false,
+ "tokenizer_model_max_length": 16384,
+ "tokenizer_padding_side": "right",
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.44.0",
+ "use_cache": true,
+ "use_sliding_window": false,
+ "vocab_size": 152064
+ }
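
For orientation (not part of the commit): this language model uses grouped-query attention, with 28 query heads sharing 4 key/value heads. A quick sanity check of the derived sizes, assuming the standard Qwen2 head layout:

    hidden_size = 3584
    num_attention_heads = 28
    num_key_value_heads = 4

    head_dim = hidden_size // num_attention_heads                      # 128
    kv_proj_width = num_key_value_heads * head_dim                     # 512 (k_proj / v_proj output width)
    queries_per_kv_head = num_attention_heads // num_key_value_heads   # 7
    print(head_dim, kv_proj_width, queries_per_kv_head)
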
llm/generation_config.json ADDED
@@ -0,0 +1,14 @@
+ {
+ "bos_token_id": 151643,
+ "do_sample": true,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "repetition_penalty": 1.05,
+ "temperature": 0.7,
+ "top_k": 20,
+ "top_p": 0.8,
+ "transformers_version": "4.44.0"
+ }
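
For reference (not committed here): these sampling defaults correspond to the following GenerationConfig, which can also be passed explicitly to generate(); the model and tokenized inputs are assumed to exist.

    from transformers import GenerationConfig

    gen_cfg = GenerationConfig(
        do_sample=True,
        temperature=0.7,
        top_p=0.8,
        top_k=20,
        repetition_penalty=1.05,
        bos_token_id=151643,
        eos_token_id=[151645, 151643],
        pad_token_id=151643,
    )
    # outputs = model.generate(**inputs, generation_config=gen_cfg)
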
llm/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
llm/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ad28602d062e7ce6f51c4343652cef63168989c08ad1a47c11e64033c6c441ef
+ size 4877660776
llm/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1f392fe912c9e60fa81d1ceff9994a769f4a08f6bb63b6d92ce6ef26fbdb1704
+ size 4932751008
llm/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dcd8091303478c06d62188b50f1a3af122ac7bc8d2396bfda7d7a4d4d56693ec
+ size 4330865200
llm/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:576065d92cfe1cfc13576cd1327672d757ef36457f4fcba9e17f0ae90a4024b7
+ size 1089994880
llm/model.safetensors.index.json ADDED
@@ -0,0 +1,346 @@
+ {
+ "metadata": {
+ "total_size": 15231233024
+ },
+ "weight_map": {
+ "lm_head.weight": "model-00004-of-00004.safetensors",
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.norm.weight": "model-00003-of-00004.safetensors"
+ }
+ }
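
A small sketch (not part of the commit) for sanity-checking a sharded checkpoint like this one: confirm that every tensor in weight_map points at a shard file that actually exists next to the index. The path below is a placeholder.

    import json, os

    index_path = "./apollo-checkout/llm/model.safetensors.index.json"  # placeholder path
    with open(index_path) as f:
        index = json.load(f)

    shard_dir = os.path.dirname(index_path)
    shards = sorted(set(index["weight_map"].values()))
    missing = [s for s in shards if not os.path.exists(os.path.join(shard_dir, s))]

    print(f'{len(index["weight_map"])} tensors across {len(shards)} shards; missing shards: {missing}')
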
llm/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
llm/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
llm/tokenizer_config.json ADDED
@@ -0,0 +1,209 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151646": {
+ "content": "<|object_ref_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151647": {
+ "content": "<|object_ref_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151648": {
+ "content": "<|box_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151649": {
+ "content": "<|box_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151650": {
+ "content": "<|quad_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151651": {
+ "content": "<|quad_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151652": {
+ "content": "<|vision_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151653": {
+ "content": "<|vision_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151654": {
+ "content": "<|vision_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151655": {
+ "content": "<|image_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151656": {
+ "content": "<|video_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151657": {
+ "content": "<tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151658": {
+ "content": "</tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151659": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151660": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151661": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151662": {
+ "content": "<|fim_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151663": {
+ "content": "<|repo_name|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151664": {
+ "content": "<|file_sep|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ }
+ },
+ "additional_special_tokens": [
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|object_ref_start|>",
+ "<|object_ref_end|>",
+ "<|box_start|>",
+ "<|box_end|>",
+ "<|quad_start|>",
+ "<|quad_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<|vision_pad|>",
+ "<|image_pad|>",
+ "<|video_pad|>"
+ ],
+ "bos_token": null,
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "legacy": false,
+ "model_max_length": 16384,
+ "pad_token": "<|endoftext|>",
+ "padding_side": "right",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
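
Usage sketch (not part of the commit): the tokenizer in llm/ is a standard Qwen2Tokenizer with a ChatML-style chat_template, so prompts can be built with apply_chat_template. The local path is a placeholder.

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("./apollo-checkout/llm")  # placeholder path to the llm/ folder
    messages = [{"role": "user", "content": "Describe what happens in the video."}]
    prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print(prompt)  # wrapped in <|im_start|> / <|im_end|> blocks per the chat_template above
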
llm/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
mm_connector.py ADDED
@@ -0,0 +1,306 @@
+ import re, math, torch
+ from collections import OrderedDict
+ from typing import Optional, Tuple
+
+ from torch import nn
+ from torch.nn.init import trunc_normal_, normal_
+ import torch.utils.checkpoint
+
+ from transformers import PreTrainedModel, PretrainedConfig, AutoConfig, AutoModel
+
+
+ class ClassInstantier(OrderedDict):
+     def __getitem__(self, key):
+         content = super().__getitem__(key)
+         cls, kwargs = content if isinstance(content, tuple) else (content, {})
+         return cls(**kwargs)
+
+
+ ACT2CLS = {"silu": nn.SiLU}
+
+ ACT2FN = ClassInstantier(ACT2CLS)
+
+
+ class WeightedNorm(nn.Module):
+     def __init__(self, hidden_size):
+         """
+         WeightedNorm
+         """
+         super().__init__()
+         self.hidden_size = hidden_size
+         self.norm = nn.LayerNorm(self.hidden_size)
+         self.wheight = nn.Parameter(torch.ones(self.hidden_size))
+         normal_(self.wheight, mean=1, std=.02)
+
+     def forward(self, x):
+         x = self.norm(x)
+         return x * self.wheight
+
+
+ class PerceiverMLP(nn.Module):
+     def __init__(
+         self,
+         hidden_size: int,
+         intermediate_size: int,
+         output_size: int,
+         hidden_act: str,
+     ):
+         super().__init__()
+         self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+         self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+         self.down_proj = nn.Linear(intermediate_size, output_size, bias=False)
+         self.act_fn = ACT2FN[hidden_act]
+
+     def forward(self, x):
+         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+
+
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+     """
+     This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+     num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+     """
+     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+     if n_rep == 1:
+         return hidden_states
+     hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+ class PerceiverAttention(nn.Module):
+     def __init__(self, connector_config, layer_idx: Optional[int] = None) -> None:
+         """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
+         super().__init__()
+
+         self.layer_idx = None
+         self.hidden_size = connector_config.text_hidden_size
+         self.num_heads = connector_config.resampler_n_heads
+         self.head_dim = connector_config.resampler_head_dim
+         self.num_key_value_heads = connector_config.num_key_value_heads
+         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+
+         self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+         self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+         self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+         self.is_causal = False
+
+     def forward(
+         self,
+         latents: torch.Tensor,
+         context: torch.Tensor,
+         past_key_value: Optional[Tuple[torch.Tensor]] = None,
+         output_attentions: bool = False,
+         use_cache: bool = False,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+         """
+         Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension!
+
+         Args:
+             latents (`torch.Tensor`): Tensor of shape [bsz, n_latents, embed_dim] representing fixed length latents to compress to.
+             context (`torch.Tensor`): Tensor of shape [bsz, seq, embed_dim] representing long-form context to resample.
+             output_attentions (`bool`, *optional*, defaults to `False`): Whether to return attention weights.
+             use_cache (`bool`, *optional*, defaults to `False`): Whether to use past_key_value for caching.
+         """
+         bsz, q_len, _ = latents.size()
+         kv_seq_len = q_len + context.size()[1]
+
+         hidden_states = torch.concat([context, latents], dim=-2)
+
+         query_states = self.q_proj(latents)
+         key_states = self.k_proj(hidden_states)
+         value_states = self.v_proj(hidden_states)
+
+         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+         key_states = key_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+         value_states = value_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+         past_key_value = getattr(self, "past_key_value", past_key_value)
+
+         if past_key_value is not None:
+             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx)
+
+         # repeat k/v heads if n_kv_heads < n_heads
+         key_states = repeat_kv(key_states, self.num_key_value_groups)
+         value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+         if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+             raise ValueError(
+                 f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                 f" {attn_weights.size()}"
+             )
+
+         # upcast attention to fp32
+         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+         attn_output = torch.matmul(attn_weights, value_states)
+
+         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+             raise ValueError(
+                 f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                 f" {attn_output.size()}"
+             )
+
+         attn_output = attn_output.transpose(1, 2).contiguous()
+         attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
+
+         attn_output = self.o_proj(attn_output)
+
+         if not output_attentions:
+             attn_weights = None
+
+         return attn_output, attn_weights, past_key_value
+
+
+ PERCEIVER_ATTENTION_CLASSES = {
+     "eager": PerceiverAttention,
+ }
+
+
+ class PerceiverLayer(nn.Module):
+     def __init__(self, connector_config, layer_idx: int):
+         super().__init__()
+         self.hidden_size = connector_config.text_hidden_size
+         self.n_latents = connector_config.num_output_tokens
+         self.depth = connector_config.resampler_depth
+         self.ff_multi = connector_config.ff_multi
+
+         self.input_latents_norm = WeightedNorm(self.hidden_size)
+         self.input_context_norm = WeightedNorm(self.hidden_size)
+         self.self_attn = PERCEIVER_ATTENTION_CLASSES[connector_config._attn_implementation](connector_config,
+ self.self_attn = PERCEIVER_ATTENTION_CLASSES[connector_config._attn_implementation](connector_config,
174
+ layer_idx=layer_idx)
175
+ self.post_attention_layernorm = WeightedNorm(self.hidden_size)
176
+ self.mlp = PerceiverMLP(
177
+ hidden_size=connector_config.text_hidden_size,
178
+ intermediate_size=connector_config.text_hidden_size * self.ff_multi,
179
+ output_size=connector_config.text_hidden_size,
180
+ hidden_act=connector_config.hidden_act,
181
+ )
182
+
183
+ def forward(
184
+ self,
185
+ latents: torch.Tensor,
186
+ context: torch.Tensor,
187
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
188
+ output_attentions: Optional[bool] = False,
189
+ use_cache: Optional[bool] = False,
190
+ **kwargs,
191
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
192
+ """
193
+ Args:
194
+ latents (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
195
+ context (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
196
+ output_attentions (`bool`, *optional*):
197
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
198
+ returned tensors for more detail.
199
+ use_cache (`bool`, *optional*):
200
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
201
+ (see `past_key_values`).
202
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
203
+ """
204
+ residual = latents
205
+
206
+ latents = self.input_latents_norm(latents)
207
+ context = self.input_context_norm(context)
208
+
209
+ latents, self_attn_weights, present_key_value = self.self_attn(
210
+ latents=latents,
211
+ context=context,
212
+ )
213
+
214
+ latents = residual + latents
215
+ residual = latents
216
+
217
+ latents = self.post_attention_layernorm(latents)
218
+ latents = self.mlp(latents)
219
+ latents = residual + latents
220
+
221
+ outputs = (latents,)
222
+
223
+ if output_attentions:
224
+ outputs += (self_attn_weights,)
225
+
226
+ if use_cache:
227
+ outputs += (present_key_value,)
228
+
229
+ return outputs
230
+
231
+
232
+ class PerceiverResampler(nn.Module):
233
+ """Perceiver Resampler that compresses input embeddings into a fixed number of latents."""
234
+
235
+ def __init__(self, connector_config) -> None:
236
+ super().__init__()
237
+ self.hidden_size = connector_config.text_hidden_size
238
+ self.hidden_act = connector_config.hidden_act
239
+ self.n_latents = connector_config.num_output_tokens
240
+ self.depth = connector_config.resampler_depth
241
+
242
+ # Create Latents for Perceiver
243
+ self.latents = nn.Parameter(torch.zeros(self.n_latents, self.hidden_size))
244
+
245
+ # Create Transformer Blocks
246
+ self.layers = nn.ModuleList([PerceiverLayer(connector_config, idx) for idx in range(self.depth)])
247
+ self.norm = WeightedNorm(self.hidden_size)
248
+ self._use_flash_attention_2 = connector_config._attn_implementation == "flash_attention_2"
249
+
250
+ def forward(
251
+ self,
252
+ context: torch.Tensor,
253
+ attention_mask: torch.Tensor = None,
254
+ ) -> torch.Tensor:
255
+ # seq embed -> bsz seq embed
256
+ latents = self.latents.unsqueeze(0).expand((context.shape[0], *self.latents.size()))
257
+
258
+ compressed_context = latents
259
+ for i, perceiver_layer in enumerate(self.layers):
260
+ layer_outputs = perceiver_layer(
261
+ compressed_context,
262
+ context,
263
+ past_key_value=None,
264
+ output_attentions=False,
265
+ use_cache=False,
266
+ )
267
+ compressed_context = layer_outputs[0]
268
+
269
+ compressed_context = self.norm(compressed_context)
270
+ return compressed_context
271
+
272
+
273
+ def build_mm_projector(
274
+ input_dim,
275
+ output_dim,
276
+ projector_type,
277
+ hidden_act='silu',
278
+ delay_load=False,
279
+ token_input_shape=0,
280
+ **kwargs
281
+ ) -> nn.Sequential:
282
+
283
+ modules = [nn.Linear(input_dim, output_dim)]
284
+ mlp_gelu_match = re.match(r'.*mlp(\d+)x_gelu$', projector_type)
285
+ if mlp_gelu_match is not None:
286
+ mlp_depth = int(mlp_gelu_match.group(1))
287
+ for _ in range(mlp_depth - 1):
288
+ modules.append(nn.GELU())
289
+ modules.append(nn.Linear(output_dim, output_dim))
290
+
291
+ return nn.Sequential(*modules)
292
+
293
+
294
+ class MMConnector(PreTrainedModel):
295
+ config_class = PretrainedConfig
296
+
297
+ def __init__(self, config: PretrainedConfig) -> None:
298
+ super().__init__(config)
299
+ self.proj = build_mm_projector(config.vision_hidden_size, config.text_hidden_size,
300
+ config.projector_type, token_input_shape=config.token_input_shape)
301
+ self.resampler = PerceiverResampler(config)
302
+
303
+ def forward(self, x):
304
+ x = self.proj(x)
305
+ x = self.resampler(x)
306
+ return x
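For reference, a minimal sketch of how the projector and PerceiverResampler above can be exercised with dummy inputs. The toy hyper-parameters and the import path are assumptions for illustration only; the released configuration uses 3584/2560-dimensional features and 128 output tokens (see mm_connector/config.json below).

import torch
from types import SimpleNamespace
# assumed import path; adjust to wherever this file lives in the repo
from mm_connector.mm_connector import build_mm_projector, PerceiverResampler

cfg = SimpleNamespace(
    text_hidden_size=64,            # toy size for illustration
    hidden_act="silu",
    num_output_tokens=8,
    resampler_depth=1,
    ff_multi=4,
    resampler_n_heads=4,
    resampler_head_dim=16,
    num_key_value_heads=2,
    _attn_implementation="eager",   # only "eager" is registered above
)

proj = build_mm_projector(input_dim=32, output_dim=cfg.text_hidden_size,
                          projector_type="mlp1x_gelu", token_input_shape=(4, 27, 27))
resampler = PerceiverResampler(cfg)

vision_tokens = torch.randn(2, 4 * 27 * 27, 32)   # (batch, flattened T*H*W tokens, vision dim)
compressed = resampler(proj(vision_tokens))
print(compressed.shape)                           # torch.Size([2, 8, 64])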
mm_connector/config.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "architectures": [
3
+ "Connector"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "ff_multi": 4,
7
+ "hidden_act": "silu",
8
+ "model_type": "mm_connector",
9
+ "num_key_value_heads": 4,
10
+ "num_output_tokens": 128,
11
+ "num_patches": 24,
12
+ "projector_type": "mlp1x_gelu",
13
+ "resampler_depth": 1,
14
+ "resampler_head_dim": 96,
15
+ "resampler_n_heads": 16,
16
+ "resampler_type": "perciver",
17
+ "rms_norm_eps": 1e-06,
18
+ "text_hidden_size": 3584,
19
+ "token_input_shape": [
20
+ 4,
21
+ 27,
22
+ 27
23
+ ],
24
+ "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.44.0",
26
+ "vision_hidden_size": 2560,
27
+ "auto_map": {
28
+ "AutoConfig": "configuration_connector.ConnectorConfig"
29
+ }
30
+ }
mm_connector/configuration_connector.py ADDED
@@ -0,0 +1,38 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from typing import Dict, List, Union
4
+ from transformers import PreTrainedModel, PretrainedConfig, AutoConfig, AutoModel
5
+ import torch.nn.functional as F
6
+ import json, os
7
+
8
+
9
+ class ConnectorConfig(PretrainedConfig):
10
+ model_type = "mm_connector"
11
+ def __init__(
12
+ self,
13
+ vision_hidden_size: List[int] = [],
14
+ text_hidden_size: int = 0,
15
+ num_patches: int = 24,
16
+ rms_norm_eps: float = 1e-4,
17
+ token_input_shape: List[int] = [],
18
+ **kwargs,
19
+ ):
20
+ super().__init__(**kwargs)
21
+ self.vision_hidden_size = vision_hidden_size
22
+ self.text_hidden_size = text_hidden_size
23
+ self.num_patches = num_patches
24
+ self.rms_norm_eps=rms_norm_eps
25
+ self.token_input_shape = token_input_shape
26
+
27
+ @classmethod
28
+ def load_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "ConnectorConfig":
29
+ cls._set_token_in_kwargs(kwargs)
30
+ config_dict, kwargs = cls.get_config_from_json(pretrained_model_name_or_path, **kwargs)
31
+ return cls.from_dict(config_dict, **kwargs)
32
+
33
+ @classmethod
34
+ def get_config_from_json(cls, config_file, **kwargs):
35
+ with open(config_file, 'r') as file:
36
+ config_data = json.load(file)
37
+ return config_data, kwargs
38
+
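A short sketch of how ConnectorConfig.load_config reads the raw JSON file shown above; note that it parses the file path directly rather than a model directory. The import path is an assumption.

# assumed import path
from mm_connector.configuration_connector import ConnectorConfig

cfg = ConnectorConfig.load_config("mm_connector/config.json")
print(cfg.text_hidden_size, cfg.vision_hidden_size, cfg.num_output_tokens)
# 3584 2560 128, per the JSON above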
mm_connector/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2973ab0aaf61364d182eca589bdc80e28a815eb94112a83cb28d42d24da6156e
3
+ size 355169704
modeling_apollo.py ADDED
@@ -0,0 +1,492 @@
1
+ from typing import List, Optional, Tuple, Union
2
+ import warnings, os, torch
3
+ import torch.nn as nn
4
+
5
+ from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, AutoModelForCausalLM, AutoTokenizer
6
+ from transformers.modeling_utils import ContextManagers, no_init_weights
7
+ from transformers.modeling_outputs import CausalLMOutputWithPast
8
+ from transformers.generation.utils import GenerateOutput
9
+ from .configuration_apollo import ApolloConfig
10
+
11
+ from .vision_tower import ApolloVisionTower
12
+ from .mm_connector import MMConnector
13
+
14
+ IGNORE_INDEX = -100
15
+ X_TOKEN_INDEX = -200
16
+
17
+
18
+ def get_model_config(config):
19
+ default_keys = ["llm_cfg", "vision_tower_cfg", "mm_connector_cfg"]
20
+ if hasattr(config, "_name_or_path") and len(config._name_or_path) >= 2:
21
+ root_path = config._name_or_path
22
+ else:
23
+ root_path = config.resume_path
24
+
25
+ return_pths = []
26
+ for key in default_keys:
27
+ cfg = getattr(config, key, None)
28
+ if isinstance(cfg, dict):
29
+ try:
30
+ return_pths.append(os.path.join(root_path, key[:-4]))
31
+ except:
32
+ raise ValueError(f"Cannot find resume path in config for {key}!")
33
+ elif isinstance(cfg, PretrainedConfig):
34
+ return_pths.append(os.path.join(root_path, key[:-4]))
35
+ elif isinstance(cfg, str):
36
+ return_pths.append(cfg)
37
+
38
+ return_list = []
39
+ for pth in return_pths:
40
+ return_list.append(AutoConfig.from_pretrained(pth, trust_remote_code=True))
41
+
42
+ return return_list
43
+
44
+
45
+ def build_llm_and_tokenizer(
46
+ llm_cfg: str,
47
+ config: PretrainedConfig,
48
+ attn_implementation=None,
49
+ model_max_length=None,
50
+ *args,
51
+ **kwargs,
52
+ ) -> PreTrainedModel:
53
+ llm_arch = getattr(llm_cfg, "architectures")[0].lower()
54
+
55
+ llm_path = llm_cfg._name_or_path
56
+ llm = AutoModelForCausalLM.from_pretrained(
57
+ llm_path, config=llm_cfg, torch_dtype=eval(config.model_dtype), *args, **kwargs
58
+ )
59
+
60
+ tokenizer = AutoTokenizer.from_pretrained(
61
+ llm_path,
62
+ model_max_length=llm_cfg.model_max_length,
63
+ padding_side="right",
64
+ use_fast=False,
65
+ legacy=False,
66
+ **kwargs
67
+ )
68
+
69
+ #config.hidden_size = llm.config.hidden_size
70
+ return llm, tokenizer
71
+
72
+
73
+ class ApolloForCausalLM(PreTrainedModel):
74
+ def __init__(self, config: ApolloConfig, *args, **kwargs):
75
+ super().__init__(config)
76
+ llm_cfg, vision_tower_cfg, mm_connector_cfg = get_model_config(config)
77
+ model_dtype = getattr(config, "model_dtype", "torch.float16")
78
+ if not hasattr(config, "model_dtype"):
79
+ warnings.warn("model_dtype not found in config, defaulting to torch.float16.")
80
+ config.model_dtype = model_dtype
81
+ # Initialize weights and apply final processing
82
+
83
+ self.lm_head = nn.Linear(llm_cfg.hidden_size, config.vocab_size, bias=False)
84
+ self.vision_tower = ApolloVisionTower(config, vision_tower_cfg)
85
+ self.mm_connector = MMConnector.from_pretrained(mm_connector_cfg._name_or_path)
86
+ self.llm, self.tokenizer = build_llm_and_tokenizer(llm_cfg, config, *args, **kwargs)
87
+ self.post_init()
88
+ self.is_loaded = True
89
+
90
+ def forward(
91
+ self,
92
+ input_ids: torch.LongTensor = None,
93
+ attention_mask: Optional[torch.Tensor] = None,
94
+ position_ids: Optional[torch.LongTensor] = None,
95
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
96
+ inputs_embeds: Optional[torch.FloatTensor] = None,
97
+ labels: Optional[torch.LongTensor] = None,
98
+ use_cache: Optional[bool] = None,
99
+ output_attentions: Optional[bool] = None,
100
+ output_hidden_states: Optional[bool] = None,
101
+ vision_input: Optional[List[torch.FloatTensor]] = None,
102
+ data_types: Optional[List[str]] = None,
103
+ return_dict: Optional[bool] = None,
104
+ cache_position=None,
105
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
106
+
107
+ if inputs_embeds is None:
108
+ (
109
+ input_ids,
110
+ position_ids,
111
+ attention_mask,
112
+ past_key_values,
113
+ inputs_embeds,
114
+ labels
115
+ ) = self.prepare_inputs_labels_for_multimodal(
116
+ input_ids,
117
+ position_ids,
118
+ attention_mask,
119
+ past_key_values,
120
+ labels,
121
+ vision_input,
122
+ data_types
123
+ )
124
+
125
+ return self.get_llm().forward(
126
+ input_ids=input_ids,
127
+ attention_mask=attention_mask,
128
+ position_ids=position_ids,
129
+ past_key_values=past_key_values,
130
+ inputs_embeds=inputs_embeds,
131
+ labels=labels,
132
+ use_cache=use_cache,
133
+ output_attentions=output_attentions,
134
+ output_hidden_states=output_hidden_states,
135
+ return_dict=return_dict,
136
+ )
137
+
138
+ @torch.no_grad()
139
+ def generate(
140
+ self,
141
+ inputs: Optional[torch.Tensor] = None,
142
+ vision_input: Optional[List[torch.Tensor]] = None,
143
+ data_types: Optional[List[str]] = None,
144
+ **kwargs,
145
+ ) -> Union[GenerateOutput, torch.LongTensor]:
146
+ position_ids = kwargs.pop("position_ids", None)
147
+ attention_mask = kwargs.pop("attention_mask", None)
148
+ if "inputs_embeds" in kwargs:
149
+ raise NotImplementedError("`inputs_embeds` is not supported")
150
+
151
+ if vision_input is not None:
152
+ (inputs, position_ids, attention_mask, _, inputs_embeds, _) = self.prepare_inputs_labels_for_multimodal(
153
+ inputs, position_ids, attention_mask, None, None, vision_input, data_types=data_types)
154
+ else:
155
+ inputs_embeds = self.get_llm().model.embed_tokens(inputs)
156
+
157
+ return self.get_llm().generate(position_ids=position_ids, attention_mask=attention_mask,
158
+ inputs_embeds=inputs_embeds, **kwargs)
159
+
160
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
161
+ vision_input = kwargs.pop("vision_input", None)
162
+ data_types = kwargs.pop("data_types", None)
163
+ inputs = self.get_llm().prepare_inputs_for_generation(input_ids, past_key_values=past_key_values,
164
+ inputs_embeds=inputs_embeds, **kwargs)
165
+ if vision_input is not None:
166
+ inputs["vision_input"] = vision_input
167
+ if data_types is not None:
168
+ inputs["data_types"] = data_types
169
+ return inputs
170
+
171
+ @classmethod
172
+ def from_pretrained(
173
+ cls,
174
+ pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
175
+ *model_args,
176
+ config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
177
+ cache_dir: Optional[Union[str, os.PathLike]] = None,
178
+ ignore_mismatched_sizes: bool = False,
179
+ force_download: bool = False,
180
+ local_files_only: bool = False,
181
+ token: Optional[Union[str, bool]] = None,
182
+ revision: str = "main",
183
+ use_safetensors: bool = None,
184
+ **kwargs,
185
+ ):
186
+
187
+ return cls.load_pretrained(
188
+ pretrained_model_name_or_path,
189
+ *model_args,
190
+ config=config,
191
+ cache_dir=cache_dir,
192
+ ignore_mismatched_sizes=ignore_mismatched_sizes,
193
+ force_download=force_download,
194
+ local_files_only=local_files_only,
195
+ token=token,
196
+ revision=revision,
197
+ use_safetensors=use_safetensors,
198
+ **kwargs,
199
+ )
200
+
201
+ def get_llm(self):
202
+ return self.llm
203
+
204
+ def get_vision_tower(self):
205
+ return self.vision_tower
206
+
207
+ def get_mm_connector(self):
208
+ return self.mm_connector
209
+
210
+ @classmethod
211
+ def load_pretrained(cls, model_path_or_config, *args, **kwargs):
212
+ kwargs.pop("config", None)
213
+
214
+ if isinstance(model_path_or_config, str):
215
+ config = AutoConfig.from_pretrained(model_path_or_config, trust_remote_code=True, **kwargs)
216
+ elif isinstance(model_path_or_config, ApolloConfig):
217
+ config = model_path_or_config
218
+ else:
219
+ raise NotImplementedError(f"wrong type, {type(model_path_or_config)} \
220
+ {isinstance(model_path_or_config, ApolloConfig)}")
221
+
222
+ model_dtype = getattr(config, "model_dtype", "torch.float16")
223
+ if not hasattr(config, "model_dtype"):
224
+ warnings.warn("model_dtype not found in config, defaulting to torch.float16.")
225
+ config.model_dtype = model_dtype
226
+
227
+ with ContextManagers([no_init_weights(_enable=True), ]):
228
+ vlm = cls(config, *args, **kwargs)
229
+
230
+ if hasattr(vlm, "llm") and hasattr(vlm, "vision_tower") and hasattr(vlm, "mm_connector"):
231
+ if vlm.is_loaded:
232
+ return vlm
233
+ else:
234
+ print('loading model failed!')
235
+ else:
236
+ print('loading model failed!')
237
+
238
+ def _encode_mm(self, x):
239
+ x = self.get_vision_tower()(x)
240
+ x = self.mm_connector(x)
241
+ return x
242
+
243
+ def encode_mm_minibatch(self, x):
244
+ split_sizes = [x_s[0].shape[0] for x_s in x]
245
+ x = [torch.split(torch.cat([x_s[i] for x_s in x], dim=0), self.config.encode_batch_size) for i in
246
+ range(self.get_vision_tower().num_vision_encoders)]
247
+ swapped_x = []
248
+ for i in range(len(x[0])):
249
+ swapped_x.append([x_s[i] for x_s in x])
250
+
251
+ features = []
252
+ for xx in swapped_x:
253
+ xx = self._encode_mm(xx)
254
+ features.append(xx)
255
+ x = torch.cat(features, dim=0)
256
+ x = torch.split(x, split_sizes, dim=0)
257
+ return [xx.contiguous().view(-1, xx.shape[2]) for xx in x]
258
+
259
+ def prepare_inputs_labels_for_multimodal(
260
+ self, input_ids, position_ids, attention_mask, past_key_values, labels, vision_input, data_types
261
+ ):
262
+ vision_tower = self.get_vision_tower()
263
+ if vision_tower is None or vision_input is None or input_ids.shape[1] == 1:
264
+ if (
265
+ past_key_values is not None
266
+ and vision_tower is not None
267
+ and vision_input is not None
268
+ and input_ids.shape[1] == 1
269
+ ):
270
+ target_shape = past_key_values[-1][-1].shape[-2] + 1
271
+ attention_mask = torch.cat(
272
+ (
273
+ attention_mask,
274
+ torch.ones(
275
+ (
276
+ attention_mask.shape[0],
277
+ target_shape - attention_mask.shape[1],
278
+ ),
279
+ dtype=attention_mask.dtype,
280
+ device=attention_mask.device,
281
+ ),
282
+ ),
283
+ dim=1,
284
+ )
285
+ position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
286
+ return (
287
+ input_ids,
288
+ position_ids,
289
+ attention_mask,
290
+ past_key_values,
291
+ None,
292
+ labels,
293
+ )
294
+
295
+ '''
296
+ vision_input is a list of tuples, and data_type is a list of strings:
297
+ data_type = ['image', 'video', 'video'..., 'text']
298
+ (for one video and two image encoders)
299
+ vision_input =
300
+ [
301
+ [image(1, T, C, H, W), image(1, T, C, H, W), image(1, T, C, H, W)],
302
+ [video(Nc1, C, T, H, W), video(Nc1, T, C, H, W), video(Nc1, T, C, H, W)],
303
+ [video(Nc2, C, T, H, W), video(Nc2, T, C, H, W), video(Nc2, T, C, H, W)],
304
+ ]
305
+ -> video encoders typically expect (C,T,H,W), images expect (C,H,W).
306
+ '''
307
+ # ====================================================================================================
308
+ merged_mm_features = self.encode_mm_minibatch(vision_input)
309
+
310
+ if not getattr(self.config, "tune_language_model", True) and getattr(self.config, "use_mm_start_end", False):
311
+ raise NotImplementedError
312
+ # ====================================================================================================
313
+ # Let's just add dummy tensors if they do not exist,
314
+ # it is a headache to deal with None all the time.
315
+ # But it is not ideal, and if you have a better idea,
316
+ # please open an issue / submit a PR, thanks.
317
+ _labels = labels
318
+ _position_ids = position_ids
319
+ _attention_mask = attention_mask
320
+ if attention_mask is None:
321
+ attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
322
+ else:
323
+ attention_mask = attention_mask.bool()
324
+ if position_ids is None:
325
+ position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
326
+ if labels is None:
327
+ labels = torch.full_like(input_ids, IGNORE_INDEX)
328
+
329
+ # remove the padding using attention_mask
330
+ input_ids_copy = input_ids.clone()
331
+ # kentang-mit@: Otherwise tokenizer out of bounds. Embeddings of image tokens will not be used.
332
+ input_ids_copy[input_ids_copy == X_TOKEN_INDEX] = 0
333
+ input_embeds = self.get_llm().model.embed_tokens(input_ids_copy)
334
+
335
+ input_ids = [
336
+ cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)
337
+ ]
338
+ input_embeds_1 = [
339
+ cur_input_embeds[cur_attention_mask]
340
+ for cur_input_embeds, cur_attention_mask in zip(input_embeds, attention_mask)
341
+ ]
342
+ labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]
343
+ # input_ids, new_input_embeds = self.inputs_merger(input_ids, input_embeds_1, merged_mm_features)
344
+ new_labels = []
345
+ new_input_embeds = []
346
+ # print("BEFORE BATCH LOOP:", len(input_ids), input_ids[0].shape, input_ids[0].device, [(x == X_TOKEN_INDEX).sum() for x in input_ids])
347
+ # kentang-mit@: If some part of the model is executed in the loop, the loop length needs to be a constant.
348
+ for batch_idx, (cur_labels, cur_input_ids, mm_features) in enumerate(
349
+ zip(labels, input_ids, merged_mm_features)):
350
+ cur_input_ids = input_ids[batch_idx]
351
+ num_mm = (cur_input_ids == X_TOKEN_INDEX).sum()
352
+ if num_mm == 0:
353
+ cur_input_embeds_1 = input_embeds_1[batch_idx]
354
+ cur_input_embeds = torch.cat([cur_input_embeds_1, mm_features[0:0]], dim=0)
355
+ new_input_embeds.append(cur_input_embeds)
356
+ new_labels.append(cur_labels)
357
+ # kentang-mit@: we do not have a placeholder image for text-only data now.
358
+ continue
359
+
360
+ if mm_features.shape[0] != num_mm:
361
+ print(data_types[batch_idx])
362
+ assert num_mm == len(
363
+ mm_features), f'Error in {data_types[batch_idx]}{num_mm}=/={len(mm_features)} not the same number of vision tokens in and vision embeddings!'
364
+
365
+ cur_input_embeds = input_embeds_1[batch_idx]
366
+ image_token_indices = (
367
+ [-1] + torch.where(cur_input_ids == X_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
368
+ )
369
+ cur_input_ids_noim = []
370
+ cur_labels = labels[batch_idx]
371
+ cur_labels_noim = []
372
+ cur_input_embeds_no_im = []
373
+ for i in range(len(image_token_indices) - 1):
374
+ cur_input_ids_noim.append(cur_input_ids[image_token_indices[i] + 1: image_token_indices[i + 1]])
375
+ cur_labels_noim.append(cur_labels[image_token_indices[i] + 1: image_token_indices[i + 1]])
376
+ cur_input_embeds_no_im.append(cur_input_embeds[image_token_indices[i] + 1: image_token_indices[i + 1]])
377
+
378
+ cur_new_input_embeds = []
379
+ cur_new_labels = []
380
+ for i in range(num_mm + 1):
381
+ cur_new_input_embeds.append(cur_input_embeds_no_im[i])
382
+ # print("cur_new_input_embeds1", cur_new_input_embeds.shape[-1])
383
+ cur_new_labels.append(cur_labels_noim[i])
384
+ if i < num_mm:
385
+ cur_image_features = mm_features[i:i + 1]
386
+ cur_new_input_embeds.append(cur_image_features)
387
+ # print("cur_new_input_embeds2", cur_new_input_embeds.shape[-1])
388
+ cur_new_labels.append(
389
+ torch.full(
390
+ (cur_image_features.shape[0],),
391
+ IGNORE_INDEX,
392
+ device=cur_labels.device,
393
+ dtype=cur_labels.dtype,
394
+ )
395
+ )
396
+
397
+ cur_new_input_embeds = torch.cat(cur_new_input_embeds)
398
+ cur_new_labels = torch.cat(cur_new_labels)
399
+
400
+ new_input_embeds.append(cur_new_input_embeds)
401
+ new_labels.append(cur_new_labels)
402
+
403
+ # Truncate sequences to max length as image embeddings can make the sequence longer
404
+ tokenizer_model_max_length = getattr(self.get_llm().config, "tokenizer_model_max_length", None)
405
+ if tokenizer_model_max_length is not None:
406
+ if any(len(x) > tokenizer_model_max_length for x in new_input_embeds):
407
+ print("Inputs truncated!")
408
+ new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
409
+ new_labels = [x[:tokenizer_model_max_length] for x in new_labels]
410
+ # Combine them
411
+ max_len = max(x.shape[0] for x in new_input_embeds)
412
+ batch_size = len(new_input_embeds)
413
+
414
+ new_input_embeds_padded = []
415
+ new_labels_padded = torch.full(
416
+ (batch_size, max_len),
417
+ IGNORE_INDEX,
418
+ dtype=new_labels[0].dtype,
419
+ device=new_labels[0].device,
420
+ )
421
+ attention_mask = torch.zeros(
422
+ (batch_size, max_len),
423
+ dtype=attention_mask.dtype,
424
+ device=attention_mask.device,
425
+ )
426
+ position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)
427
+ for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
428
+ cur_len = cur_new_embed.shape[0]
429
+ if getattr(self.get_llm().config, "tokenizer_padding_side", "right") == "left":
430
+ new_input_embeds_padded.append(
431
+ torch.cat(
432
+ (
433
+ torch.zeros(
434
+ (max_len - cur_len, cur_new_embed.shape[1]),
435
+ dtype=cur_new_embed.dtype,
436
+ device=cur_new_embed.device,
437
+ ),
438
+ cur_new_embed,
439
+ ),
440
+ dim=0,
441
+ )
442
+ )
443
+ if cur_len > 0:
444
+ new_labels_padded[i, -cur_len:] = cur_new_labels
445
+ attention_mask[i, -cur_len:] = True
446
+ position_ids[i, -cur_len:] = torch.arange(
447
+ 0, cur_len, dtype=position_ids.dtype, device=position_ids.device
448
+ )
449
+ else:
450
+ new_input_embeds_padded.append(
451
+ torch.cat(
452
+ (
453
+ cur_new_embed,
454
+ torch.zeros(
455
+ (max_len - cur_len, cur_new_embed.shape[1]),
456
+ dtype=cur_new_embed.dtype,
457
+ device=cur_new_embed.device,
458
+ ),
459
+ ),
460
+ dim=0,
461
+ )
462
+ )
463
+ if cur_len > 0:
464
+ new_labels_padded[i, :cur_len] = cur_new_labels
465
+ attention_mask[i, :cur_len] = True
466
+ position_ids[i, :cur_len] = torch.arange(
467
+ 0, cur_len, dtype=position_ids.dtype, device=position_ids.device
468
+ )
469
+
470
+ new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)
471
+
472
+ if _labels is None:
473
+ new_labels = None
474
+ else:
475
+ new_labels = new_labels_padded
476
+
477
+ if _attention_mask is None:
478
+ attention_mask = None
479
+ else:
480
+ attention_mask = attention_mask.to(dtype=_attention_mask.dtype)
481
+
482
+ if _position_ids is None:
483
+ position_ids = None
484
+
485
+ return (
486
+ None,
487
+ position_ids,
488
+ attention_mask,
489
+ past_key_values,
490
+ new_input_embeds,
491
+ new_labels,
492
+ )
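To make the vision_input layout documented in prepare_inputs_labels_for_multimodal concrete, here is a sketch for the two-encoder setup used by this checkpoint (SigLIP image encoder plus InternVideo2 video encoder). The shapes are illustrative dummies, not a preprocessing recipe.

import torch

num_clips = 3                                               # clips sampled from one video
siglip_frames = torch.randn(num_clips, 4, 3, 384, 384)      # (Nc, T, C, H, W) for the SigLIP tower
internvideo_clips = torch.randn(num_clips, 3, 4, 224, 224)  # (Nc, C, T, H, W) for the InternVideo2 tower

vision_input = [[siglip_frames, internvideo_clips]]         # one sample, one tensor per encoder
data_types = ["video"]                                      # aligned with vision_input

# With a loaded ApolloForCausalLM `model` and tokenized `input_ids`, generation would then look like:
# output_ids = model.generate(inputs=input_ids, vision_input=vision_input,
#                             data_types=data_types, max_new_tokens=64)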
vision_tower.py ADDED
@@ -0,0 +1,556 @@
1
+ import torch, os, PIL, numbers, json
+ from collections import OrderedDict  # json and OrderedDict are used in the save_pretrained methods below
2
+ from PIL import Image
3
+ import cv2
4
+
5
+ from transformers.modeling_utils import PreTrainedModel
6
+ from transformers.models.siglip.modeling_siglip import SiglipVisionModel
7
+ from transformers import AutoConfig, AutoModel, SiglipImageProcessor, SiglipVisionConfig, PretrainedConfig
8
+ from typing import Union
9
+ import torch.nn.functional as F
10
+ import numpy as np
11
+
12
+
13
+ def crop_clip(clip, min_h, min_w, h, w):
14
+ if isinstance(clip[0], np.ndarray):
15
+ cropped = [img[min_h:min_h + h, min_w:min_w + w, :] for img in clip]
16
+
17
+ elif isinstance(clip[0], PIL.Image.Image):
18
+ cropped = [
19
+ img.crop((min_w, min_h, min_w + w, min_h + h)) for img in clip
20
+ ]
21
+ else:
22
+ raise TypeError('Expected numpy.ndarray or PIL.Image' +
23
+ ' but got list of {0}'.format(type(clip[0])))
24
+ return cropped
25
+
26
+
27
+ class Normalize(object):
28
+ """Normalize a clip with mean and standard deviation.
29
+ Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform
30
+ will normalize each channel of the input ``torch.*Tensor`` i.e.
31
+ ``input[channel] = (input[channel] - mean[channel]) / std[channel]``
32
+ .. note::
33
+ This transform acts out of place, i.e., it does not mutate the input tensor.
34
+ Args:
35
+ mean (sequence): Sequence of means for each channel.
36
+ std (sequence): Sequence of standard deviations for each channel.
37
+ """
38
+
39
+ def __init__(self, mean, std):
40
+ self.mean = mean
41
+ self.std = std
42
+
43
+ def __call__(self, clip):
44
+ """
45
+ Args:
46
+ clip (Tensor): Tensor clip of size (T, C, H, W) to be normalized.
47
+ Returns:
48
+ Tensor: Normalized Tensor clip.
49
+ """
50
+ return normalize(clip, self.mean, self.std)
51
+
52
+ def __repr__(self):
53
+ return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)
54
+
55
+
56
+ class CenterCrop(object):
57
+ """Extract center crop at the same location for a list of images
58
+ Args:
59
+ size (sequence or int): Desired output size for the
60
+ crop in format (h, w)
61
+ """
62
+
63
+ def __init__(self, size):
64
+ if isinstance(size, numbers.Number):
65
+ size = (size, size)
66
+
67
+ self.size = size
68
+
69
+ def __call__(self, clip):
70
+ """
71
+ Args:
72
+ img (PIL.Image or numpy.ndarray): List of images to be cropped
73
+ in format (h, w, c) in numpy.ndarray
74
+ Returns:
75
+ PIL.Image or numpy.ndarray: Cropped list of images
76
+ """
77
+ h, w = self.size
78
+ if isinstance(clip[0], np.ndarray):
79
+ im_h, im_w, im_c = clip[0].shape
80
+ elif isinstance(clip[0], PIL.Image.Image):
81
+ im_w, im_h = clip[0].size
82
+ else:
83
+ raise TypeError('Expected numpy.ndarray or PIL.Image' +
84
+ ' but got list of {0}'.format(type(clip[0])))
85
+ if w > im_w or h > im_h:
86
+ error_msg = (
87
+ 'Initial image size should be larger than '
88
+ 'the crop size, but got crop size ({w}, {h}) while '
89
+ 'initial image is ({im_w}, {im_h})'.format(
90
+ im_w=im_w, im_h=im_h, w=w, h=h))
91
+ raise ValueError(error_msg)
92
+
93
+ x1 = int(round((im_w - w) / 2.))
94
+ y1 = int(round((im_h - h) / 2.))
95
+ cropped = crop_clip(clip, y1, x1, h, w)
96
+
97
+ return cropped
98
+
99
+
100
+ def resize_clip(clip, size, interpolation='bilinear'):
101
+ if isinstance(clip[0], np.ndarray):
102
+ if isinstance(size, numbers.Number):
103
+ im_h, im_w, im_c = clip[0].shape
104
+ # Min spatial dim already matches minimal size
105
+ if (im_w <= im_h and im_w == size) or (im_h <= im_w
106
+ and im_h == size):
107
+ return clip
108
+ new_h, new_w = get_resize_sizes(im_h, im_w, size)
109
+ size = (new_w, new_h)
110
+ else:
111
+ size = size[0], size[1]
112
+ if interpolation == 'bilinear':
113
+ np_inter = cv2.INTER_LINEAR
114
+ else:
115
+ np_inter = cv2.INTER_NEAREST
116
+ scaled = [
117
+ cv2.resize(img, size, interpolation=np_inter) for img in clip
118
+ ]
119
+ elif isinstance(clip[0], PIL.Image.Image):
120
+ if isinstance(size, numbers.Number):
121
+ im_w, im_h = clip[0].size
122
+ # Min spatial dim already matches minimal size
123
+ if (im_w <= im_h and im_w == size) or (im_h <= im_w
124
+ and im_h == size):
125
+ return clip
126
+ new_h, new_w = get_resize_sizes(im_h, im_w, size)
127
+ size = (new_w, new_h)
128
+ else:
129
+ size = size[1], size[0]
130
+ if interpolation == 'bilinear':
131
+ pil_inter = PIL.Image.BILINEAR
132
+ else:
133
+ pil_inter = PIL.Image.NEAREST
134
+ scaled = [img.resize(size, pil_inter) for img in clip]
135
+ else:
136
+ raise TypeError('Expected numpy.ndarray or PIL.Image' +
137
+ ' but got list of {0}'.format(type(clip[0])))
138
+ return scaled
139
+
140
+
141
+ def _is_tensor_clip(clip):
142
+ return torch.is_tensor(clip) and clip.ndimension() == 4
143
+
144
+
145
+ def get_resize_sizes(im_h, im_w, size):
146
+ if im_w < im_h:
147
+ ow = size
148
+ oh = int(size * im_h / im_w)
149
+ else:
150
+ oh = size
151
+ ow = int(size * im_w / im_h)
152
+ return oh, ow
153
+
154
+
155
+ def normalize(clip, mean, std, inplace=False):
156
+ if not _is_tensor_clip(clip):
157
+ raise TypeError('tensor is not a torch clip.')
158
+
159
+ if not inplace:
160
+ clip = clip.clone()
161
+
162
+ dtype = clip.dtype
163
+ mean = torch.as_tensor(mean, dtype=dtype, device=clip.device)
164
+ std = torch.as_tensor(std, dtype=dtype, device=clip.device)
165
+ clip.sub_(mean[:, None, None, None]).div_(std[:, None, None, None])
166
+
167
+ return clip
168
+
169
+
170
+ class Resize(object):
171
+ """Resizes a list of (H x W x C) numpy.ndarray to the final size
172
+ The larger the original image is, the more time it takes to
173
+ interpolate
174
+ Args:
175
+ interpolation (str): Can be one of 'nearest', 'bilinear'
176
+ defaults to nearest
177
+ size (tuple): (width, height)
178
+ """
179
+
180
+ def __init__(self, size, interpolation='nearest'):
181
+ self.size = size
182
+ self.interpolation = interpolation
183
+
184
+ def __call__(self, clip):
185
+ resized = resize_clip(
186
+ clip, self.size, interpolation=self.interpolation)
187
+ return resized
188
+
189
+
190
+ class Compose(object):
191
+ """Composes several transforms
192
+ Args:
193
+ transforms (list of ``Transform`` objects): list of transforms
194
+ to compose
195
+ """
196
+
197
+ def __init__(self, transforms):
198
+ self.transforms = transforms
199
+
200
+ def __call__(self, clip):
201
+ for t in self.transforms:
202
+ clip = t(clip)
203
+ return clip
204
+
205
+
206
+ def convert_img(img):
207
+ """Converts (H, W, C) numpy.ndarray to (C, W, H) format"""
208
+ if len(img.shape) == 3:
209
+ img = img.transpose(2, 0, 1)
210
+ if len(img.shape) == 2:
211
+ img = np.expand_dims(img, 0)
212
+ return img
213
+
214
+
215
+ class ClipToTensor(object):
216
+ """Convert a list of m (H x W x C) numpy.ndarrays in the range [0, 255]
217
+ to a torch.FloatTensor of shape (C x m x H x W) in the range [0, 1.0]
218
+ """
219
+
220
+ def __init__(self, channel_nb=3, div_255=True, numpy=False):
221
+ self.channel_nb = channel_nb
222
+ self.div_255 = div_255
223
+ self.numpy = numpy
224
+
225
+ def __call__(self, clip):
226
+ """
227
+ Args: clip (list of numpy.ndarray): clip (list of images)
228
+ to be converted to tensor.
229
+ """
230
+ # Retrieve shape
231
+ if isinstance(clip[0], np.ndarray):
232
+ h, w, ch = clip[0].shape
233
+ assert ch == self.channel_nb, "Got {0} instead of 3 channels".format(ch)
234
+ elif isinstance(clip[0], Image.Image):
235
+ w, h = clip[0].size
236
+ else:
237
+ raise TypeError(
238
+ "Expected numpy.ndarray or PIL.Image\
239
+ but got list of {0}".format(
240
+ type(clip[0])
241
+ )
242
+ )
243
+
244
+ np_clip = np.zeros([self.channel_nb, len(clip), int(h), int(w)])
245
+
246
+ # Convert
247
+ for img_idx, img in enumerate(clip):
248
+ if isinstance(img, np.ndarray):
249
+ pass
250
+ elif isinstance(img, Image.Image):
251
+ img = np.array(img, copy=False)
252
+ else:
253
+ raise TypeError(
254
+ "Expected numpy.ndarray or PIL.Image\
255
+ but got list of {0}".format(
256
+ type(clip[0])
257
+ )
258
+ )
259
+ img = convert_img(img)
260
+ np_clip[:, img_idx, :, :] = img
261
+ if self.numpy:
262
+ if self.div_255:
263
+ np_clip = np_clip / 255.0
264
+ return np_clip
265
+
266
+ else:
267
+ tensor_clip = torch.from_numpy(np_clip)
268
+
269
+ if not isinstance(tensor_clip, torch.FloatTensor):
270
+ tensor_clip = tensor_clip.float()
271
+ if self.div_255:
272
+ tensor_clip = torch.div(tensor_clip, 255)
273
+ return tensor_clip
274
+
275
+
276
+ class VisionTowerConfig(PretrainedConfig):
277
+ model_type = "vision_tower"
278
+
279
+ def __init__(self, vision_tower_name: str = None, **kwargs):
280
+ super().__init__()
281
+ self.vision_tower_name = vision_tower_name
282
+
283
+
284
+ class ProcessorWrapper:
285
+ def __init__(self, transform=None, processor=None, height=378, width=378, frames_per_clip=1,
286
+ image_mean=[0.48145466, 0.4578275, 0.40821073]):
287
+ assert transform is not None or processor is not None, "ERROR: neither `transform` nor `processor` was defined! You must define one of them"
288
+ assert transform is None or processor is None, "ERROR: you defined both `transform` and `processor`! You must define only one of them"
289
+ self._size = {
290
+ "height": height,
291
+ "width": width,
292
+ "frames_per_clip": frames_per_clip
293
+ }
294
+ self._transforms = transform
295
+ self._processor = processor
296
+ self.image_mean = image_mean
297
+
298
+ @property
299
+ def size(self):
300
+ return self._size
301
+
302
+ def preprocess(self, image, return_tensors='pt'):
303
+ # Build a dict shaped like a Hugging Face image-processor output
304
+ output = {}
305
+ if self._transforms is not None:
306
+ output['pixel_values'] = [self._transforms(image)]
307
+
308
+ else:
309
+ output = self._processor(image, return_tensors='pt')
310
+ return output
311
+
312
+ def save_pretrained(self, save_path):
313
+ if self._transforms is not None:
314
+ transform_dict = transform_to_dict(self._transforms)
315
+ transform_dict["image_processor_type"] = "transforms"
316
+ with open(os.path.join(save_path, 'preprocessor_config.json'), 'w') as f:
317
+ json.dump(transform_dict, f, indent=4)
318
+ else:
319
+ self._processor.save_pretrained(save_path)
320
+ return
321
+
322
+
323
+ class VisionTower(PreTrainedModel):
324
+ config_class = VisionTowerConfig
325
+
326
+ def __init__(self, model_name_or_path: str, config: PretrainedConfig, vision_config: VisionTowerConfig = None):
327
+ super().__init__(vision_config)
328
+ self.vision_tower_name = model_name_or_path
329
+ self.vision_config = vision_config
330
+ self.select_layer = getattr(config, "mm_vision_select_layer", -2)
331
+ self.select_feature = getattr(config, "mm_vision_select_feature", "patch")
332
+ self.encode_batch_size = getattr(config, "encode_batch_size", 0) // 2
333
+ self.num_encode_batch = getattr(config, "num_encode_batch", 0) // 2
334
+ self.temporal_tubelet_size = getattr(vision_config, "tubelet_size", 1)
335
+
336
+ def feature_select(self, image_features):
337
+ if self.select_layer is not None:
338
+ image_features = image_features.hidden_states[self.select_layer]
339
+
340
+ if self.select_feature == "patch":
341
+ image_features = image_features[:, 1:]
342
+ elif self.select_feature == "cls_patch":
343
+ image_features = image_features
344
+ else:
345
+ raise ValueError(f"Unexpected select feature: {self.select_feature}")
346
+
347
+ return image_features
348
+
349
+ def vision_tower_forward(self, image):
350
+ image_feature = self.vision_tower(image, output_hidden_states=True)
351
+ return image_feature
352
+
353
+ def _forward(self, images, out_T=1):
354
+ if type(images) is list:
355
+ image_features = []
356
+ for image in images:
357
+ image_feature = self.vision_tower_forward(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
358
+ image_feature = self.feature_select(image_feature).to(image.dtype)
359
+ image_feature = image_feature.reshape(image_feature.shape[0], self.W, self.H, self.hidden_size)
360
+ image_features.append(image_feature)
361
+ else:
362
+ original_shape = images.shape
363
+ if len(original_shape) == 5 and self.T == 1:
364
+ # downsample temporally if needed, and reshape from (B, T, C, W, H) to (B*T, C, W, H).
365
+ images = images[:, ::original_shape[1] // out_T, ...]
366
+ original_shape = images.shape
367
+ images = images.view(-1, *original_shape[2:])
368
+
369
+ image_features = self.vision_tower_forward(images.to(device=self.device, dtype=self.dtype))
370
+ image_features = self.feature_select(image_features).to(images.dtype)
371
+ # Reshape back to (B, T, ...) if necessary
372
+ if len(original_shape) == 5 and self.T == 1:
373
+ # Assuming the feature dimension does not change, adapt the following line if it does
374
+ new_shape = list(image_features.shape[:-2]) + [self.W, self.H, self.hidden_size]
375
+ image_features = image_features.reshape(new_shape)
376
+ feature_size = image_features.shape[1:]
377
+ image_features = image_features.view(original_shape[0], original_shape[1], *feature_size)
378
+
379
+ else:
380
+ image_features = image_features.reshape(image_features.shape[0], self.T, self.W, self.H, self.hidden_size)
381
+
382
+ return image_features
383
+
384
+ def forward(self, images):
385
+ return self._forward(images)
386
+
387
+ @property
388
+ def dummy_feature(self):
389
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
390
+
391
+ @property
392
+ def dtype(self):
393
+ return self.vision_tower.dtype
394
+
395
+ @property
396
+ def device(self):
397
+ return self.vision_tower.device
398
+
399
+ @property
400
+ def num_patches(self):
401
+ return (self.config.image_size // self.config.patch_size) ** 2
402
+
403
+
404
+ class InternVideoTower(VisionTower):
405
+ def __init__(self, model_name_or_path: str, config: PretrainedConfig, vision_config: PretrainedConfig = None):
406
+ if vision_config is None:
407
+ vision_config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
408
+
409
+ super().__init__(model_name_or_path, config, vision_config)
410
+ self.vision_config = vision_config
411
+ normalize = ((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
412
+
413
+ print('loading: ', model_name_or_path)
414
+ model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)
415
+ self.vision_tower = model.to(dtype=eval(config.model_dtype))
416
+
417
+ transform = Compose([
418
+ Resize(self.vision_config.img_size, interpolation='bilinear'),
419
+ CenterCrop(size=(self.vision_config.img_size, self.vision_config.img_size)),
420
+ ClipToTensor(),
421
+ Normalize(mean=normalize[0], std=normalize[1])
422
+ ])
423
+
424
+ self.vision_processor = ProcessorWrapper(transform=transform,
425
+ height=self.vision_config.img_size,
426
+ width=self.vision_config.img_size,
427
+ frames_per_clip=self.vision_config.num_frames,
428
+ image_mean=normalize[0])
429
+
430
+ self.W = self.H = vision_config.img_size // vision_config.patch_size
431
+ self.T = self.vision_config.num_frames // self.vision_config.tubelet_size
432
+ self.num_frames = self.vision_config.num_frames
433
+ self.hidden_size = vision_config.d_model
434
+ self.vision_select_layer=self.select_layer
435
+ self.select_layer=None
436
+
437
+ def vision_tower_forward(self, video):
438
+ if video.shape[-3] < self.num_frames:
439
+ video = video.repeat_interleave(self.num_frames, dim=-3)
440
+ elif video.shape[-3] > self.num_frames:
441
+ video = video[:, :, ::video.shape[-3] // self.num_frames, ...]
442
+
443
+ video_feature = self.vision_tower(video.to(device=self.device, dtype=self.dtype),
444
+ x_vis_return_idx=self.vision_select_layer, x_vis_only=True)
445
+
446
+ return video_feature
447
+
448
+ @property
449
+ def device(self):
450
+ return self.vision_tower.pos_embed.device
451
+
452
+
453
+ class SiglipVisionTower(VisionTower):
454
+ def __init__(self, model_name_or_path: str, config: PretrainedConfig, vision_config: PretrainedConfig = None):
455
+ if vision_config is None:
456
+ vision_config = SiglipVisionConfig.from_pretrained(model_name_or_path)
457
+
458
+ super().__init__(model_name_or_path, config, vision_config)
459
+ self.vision_config = vision_config
460
+ self.vision_tower_name = model_name_or_path
461
+ self.vision_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)
462
+
463
+ print('loading: ', model_name_or_path)
464
+ self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
465
+
466
+ self.hidden_size = self.vision_config.hidden_size
467
+ self.W = self.H = self.vision_config.image_size // self.vision_config.patch_size
468
+ self.T = 1
469
+ self.select_feature = "cls_patch"
470
+
471
+
472
+ class ApolloVisionTower(PreTrainedModel):
473
+ def __init__(self, config, vision_tower_cfg):
474
+ super(ApolloVisionTower, self).__init__(config, vision_tower_cfg)
475
+ self.model_name_or_path = vision_tower_cfg._name_or_path
476
+ self.vision_towers = vision_tower_cfg.vision_towers
477
+ self._config = vision_tower_cfg
478
+
479
+ for vision_tower_name in self.vision_towers:
480
+ if 'internvideo' in vision_tower_name.lower():
481
+ vision_tower = InternVideoTower(os.path.join(vision_tower_cfg._name_or_path, vision_tower_name), config)
482
+ elif 'siglip' in vision_tower_name.lower():
483
+ vision_tower = SiglipVisionTower(os.path.join(vision_tower_cfg._name_or_path, vision_tower_name),
484
+ config)
485
+
486
+ setattr(self, vision_tower_name, vision_tower)
487
+
488
+ self.vision_processor = [getattr(self, vt).vision_processor for vt in self.vision_towers]
489
+ self.num_vision_encoders = len(self.vision_towers)
490
+ self.W = self.H = max([getattr(self, vt).W for vt in self.vision_towers])
491
+ self.T = max([getattr(self, vt).T for vt in self.vision_towers])
492
+ self.max_tubelet_size = max(
493
+ [getattr(getattr(self, vt).vision_config, 'tubelet_size', 1) for vt in self.vision_towers])
494
+
495
+ self._hidden_size = sum([getattr(self, vt).hidden_size for vt in self.vision_towers])
496
+ self.token_output_shape = (self.T, self.W, self.H)
497
+ self.config.num_vision_encoders = self.num_vision_encoders
498
+ self.config.vision_towers = self.vision_towers
499
+ self.config.token_output_shape = self.token_output_shape
500
+
501
+ def forward(self, x):
502
+ output_features = []
503
+ for x_s, vision_tower_name in zip(x, self.vision_towers):
504
+ vision_tower = getattr(self, vision_tower_name)
505
+ features = vision_tower._forward(x_s, out_T=self.T)
506
+
507
+ if len(features.shape) != len(self.token_output_shape) + 2:
508
+ features = features.unsqueeze(1)
509
+
510
+ if features.shape[-len(self.token_output_shape) - 1:-1] != self.token_output_shape:
511
+ features = features.permute(0, 4, 1, 2, 3).contiguous() # shape [B, D, T, W, H]
512
+ features = F.interpolate(features.to(torch.float32), size=self.token_output_shape, mode='trilinear',
513
+ align_corners=False).to(features.dtype)
514
+ features = features.permute(0, 2, 3, 4, 1).contiguous()
515
+
516
+ output_features.append(features)
517
+
518
+ output_features = torch.cat(output_features, dim=-1)
519
+ output_features = torch.flatten(output_features, start_dim=1, end_dim=-2)
520
+ return output_features
521
+
522
+ def save_pretrained(
523
+ self,
524
+ save_directory: Union[str, os.PathLike],
525
+ state_dict=None,
526
+ **kwargs,
527
+ ):
528
+ if state_dict is None:
529
+ state_dict = self.state_dict()
530
+
531
+ for vision_tower_name in self.vision_towers:
532
+ vision_tower = getattr(self, vision_tower_name)
533
+ vision_tower_state_dict = OrderedDict(
534
+ {k.split(f"vision_tower.{vision_tower_name}.vision_tower.")[-1]: v for k, v in state_dict.items() if
535
+ vision_tower_name in k}
536
+ )
537
+ vision_tower.vision_tower.save_pretrained(os.path.join(save_directory, vision_tower_name),
538
+ state_dict=vision_tower_state_dict, **kwargs)
539
+ vision_tower.vision_processor.save_pretrained(os.path.join(save_directory, vision_tower_name))
540
+
541
+ config = self.config
542
+ config.configs = {}
543
+ config.save_pretrained(save_directory)
544
+
545
+ @property
546
+ def patch_size(self):
547
+ return self._patch_size
548
+
549
+ @property
550
+ def image_size(self):
551
+ return self._image_size
552
+
553
+ @property
554
+ def hidden_size(self):
555
+ return self._hidden_size
556
+
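A minimal sketch of the clip preprocessing pipeline that InternVideoTower assembles above, applied to a list of PIL frames. The import assumes vision_tower.py is importable as a module, and the 224 resolution matches the bundled InternVideo2 config.

import numpy as np
from PIL import Image
# assumed import of the transform classes defined in vision_tower.py above
from vision_tower import Compose, Resize, CenterCrop, ClipToTensor, Normalize

mean, std = (0.485, 0.456, 0.406), (0.229, 0.224, 0.225)
transform = Compose([
    Resize(224, interpolation='bilinear'),
    CenterCrop(size=(224, 224)),
    ClipToTensor(),                       # list of H x W x C frames -> float tensor (C, T, H, W) in [0, 1]
    Normalize(mean=mean, std=std),
])

frames = [Image.fromarray(np.zeros((240, 320, 3), dtype=np.uint8)) for _ in range(4)]
clip = transform(frames)
print(clip.shape)                         # torch.Size([3, 4, 224, 224])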
vision_tower/config.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "configs": {},
3
+ "model_type": "hybrid_vision_tower",
4
+ "num_vision_encoders": 2,
5
+ "token_output_shape": [
6
+ 4,
7
+ 27,
8
+ 27
9
+ ],
10
+ "transformers_version": "4.44.0",
11
+ "vision_towers": [
12
+ "siglip-so400m-patch14-384",
13
+ "internvideo2"
14
+ ],
15
+ "auto_map": {
16
+ "AutoConfig": "configuration_hybrid.HybridTowerConfig"
17
+ }
18
+ }
vision_tower/configuration_hybrid.py ADDED
@@ -0,0 +1,48 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
5
+ import os
6
+ import torch.nn.functional as F
7
+ from transformers.modeling_utils import PreTrainedModel
8
+ from transformers.configuration_utils import PretrainedConfig
9
+ from transformers import AutoConfig
10
+ from collections import OrderedDict
11
+
12
+
13
+ class HybridTowerConfig(PretrainedConfig):
14
+ model_type = "hybrid_vision_tower"
15
+
16
+ def __init__(self, configs=None, **kwargs):
17
+ """
18
+ Initializes the HybridTowerConfig.
19
+
20
+ Args:
21
+ configs (dict, optional): A dictionary where keys are component names and values are
22
+ instances of configurations that have a `to_dict()` method.
23
+ **kwargs: Additional keyword arguments that are passed to the superclass.
24
+ """
25
+ super().__init__(**kwargs)
26
+ self.configs = {}
27
+
28
+ if configs is not None:
29
+ if not isinstance(configs, dict):
30
+ raise TypeError("configs must be a dictionary where keys are component names and values are configuration objects.")
31
+
32
+ for component_name, config in configs.items():
33
+ if hasattr(config, 'to_dict'):
34
+ self.configs[component_name] = config.to_dict()
35
+ else:
36
+ raise TypeError(f"The configuration for '{component_name}' does not have a to_dict() method and cannot be serialized.")
37
+
38
+ def to_dict(self):
39
+ """
40
+ Serializes this instance to a Python dictionary.
41
+
42
+ Returns:
43
+ dict: A dictionary containing all the keys and values of this configuration instance.
44
+ """
45
+ config_dict = super().to_dict()
46
+ config_dict['configs'] = self.configs
47
+ return config_dict
48
+
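A small sketch of how HybridTowerConfig serializes per-encoder configurations, using SiglipVisionConfig as the sub-config. The import path is an assumption.

from transformers import SiglipVisionConfig
# assumed import path for the file above
from configuration_hybrid import HybridTowerConfig

siglip_cfg = SiglipVisionConfig()   # default SigLIP vision settings
hybrid = HybridTowerConfig(configs={"siglip-so400m-patch14-384": siglip_cfg})
print(list(hybrid.to_dict()["configs"].keys()))   # ['siglip-so400m-patch14-384']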
vision_tower/internvideo2/config.json ADDED
@@ -0,0 +1,54 @@
1
+ {
2
+ "architectures": [
3
+ "PretrainInternVideo2"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.0,
6
+ "attn_pool_num_heads": 16,
7
+ "checkpoint_num": 40,
8
+ "clip_embed_dim": 768,
9
+ "clip_input_resolution": 224,
10
+ "clip_norm_type": "l2",
11
+ "clip_return_layer": 6,
12
+ "clip_student_return_interval": 1,
13
+ "clip_teacher": null,
14
+ "clip_teacher_embed_dim": 3200,
15
+ "clip_teacher_final_dim": 768,
16
+ "clip_teacher_return_interval": 1,
17
+ "d_model": 1408,
18
+ "encoder_stride": 16,
19
+ "hidden_act": "gelu",
20
+ "hidden_dropout_prob": 0.0,
21
+ "hidden_size": 768,
22
+ "image_mask_ratio": 0.5,
23
+ "image_mask_type": "random",
24
+ "image_size": 224,
25
+ "img_size": 224,
26
+ "initializer_range": 0.02,
27
+ "intermediate_size": 3072,
28
+ "keep_temporal": false,
29
+ "layer_norm_eps": 1e-12,
30
+ "model_type": "internvideo2",
31
+ "name": "pretrain_internvideo2_1b_patch14_224",
32
+ "num_attention_heads": 12,
33
+ "num_channels": 3,
34
+ "num_frames": 4,
35
+ "num_heads": 16,
36
+ "num_hidden_layers": 12,
37
+ "only_mask": true,
38
+ "patch_size": 14,
39
+ "qkv_bias": false,
40
+ "sep_image_video_pos_embed": true,
41
+ "torch_dtype": "bfloat16",
42
+ "transformers_version": "4.44.0",
43
+ "tubelet_size": 1,
44
+ "use_checkpoint": true,
45
+ "use_flash_attn": false,
46
+ "use_fused_mlp": false,
47
+ "use_fused_rmsnorm": false,
48
+ "video_mask_ratio": 0.8,
49
+ "video_mask_type": "random",
50
+ "auto_map": {
51
+ "AutoConfig": "configuration_internvideo2.InternVideo2Config",
52
+ "AutoModel": "modeling_internvideo2.InternVideo2Model"
53
+ }
54
+ }
vision_tower/internvideo2/configuration_internvideo2.py ADDED
@@ -0,0 +1,91 @@
1
+ from transformers import PretrainedConfig
2
+
3
+
4
+ class InternVideo2Config(PretrainedConfig):
5
+ model_type = "internvideo2"
6
+
7
+ def __init__(
8
+ self,
9
+ img_size=224,
10
+ patch_size=14,
11
+ tubelet_size=1,
12
+ num_frames=8,
13
+ d_model=1408,
14
+ num_heads=16,
15
+ depth=40,
16
+ mlp_ratio=48 / 11,
17
+ qkv_bias=False,
18
+ init_values=1e-5,
19
+ use_checkpoint=False,
20
+ checkpoint_num=0,
21
+ use_flash_attn=False,
22
+ use_fused_mlp=False,
23
+ use_fused_rmsnorm=False,
24
+ qk_normalization=True,
25
+ clip_embed_dim=1408,
26
+ attn_pool_num_heads=16,
27
+ clip_teacher_embed_dim=512,
28
+ clip_teacher_final_dim=512,
29
+ clip_student_return_interval=4,
30
+ clip_return_layer=3,
31
+ clip_norm_type="l2",
32
+ sep_image_video_pos_embed=False,
33
+ **kwargs,
34
+ ):
35
+ """
36
+ This is the configuration class to store the configuration of a `InternVideo2Model`.
37
+ It is used to instantiate a InternVideo2 model according to the specified arguments,
38
+ defining the model architecture.
39
+
40
+ Args:
41
+ img_size (int, optional): Input image size. Defaults to 224.
42
+ patch_size (int, optional): Size of each patch. Defaults to 14.
43
+ tubelet_size (int, optional): Temporal tubelet size. Defaults to 1.
44
+ num_frames (int, optional): Number of frames in the video input. Defaults to 8.
45
+ d_model (int, optional): Dimension of the model embeddings. Defaults to 1408.
46
+ num_heads (int, optional): Number of attention heads. Defaults to 16.
47
+ depth (int, optional): Number of transformer encoder layers. Defaults to 40.
48
+ mlp_ratio (float, optional): Ratio of MLP hidden dim to embedding dim. Defaults to 48/11.
49
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Defaults to False.
50
+ init_values (float, optional): Initial values for layer scale. Defaults to 1e-5.
51
+ use_checkpoint (bool, optional): Whether to use gradient checkpointing. Defaults to False.
52
+ checkpoint_num (int, optional): Number of layers to apply checkpointing. Defaults to 0.
53
+ use_flash_attn (bool, optional): Whether to use FlashAttention. Defaults to False.
54
+ use_fused_mlp (bool, optional): Whether to use fused MLP. Defaults to False.
55
+ use_fused_rmsnorm (bool, optional): Whether to use fused RMSNorm. Defaults to False.
56
+ qk_normalization (bool, optional): Whether to apply QK normalization. Defaults to True.
57
+ clip_embed_dim (int, optional): Embedding dimension for CLIP. Defaults to 1408.
58
+ attn_pool_num_heads (int, optional): Number of heads for attention pooling. Defaults to 16.
59
+ clip_teacher_embed_dim (int, optional): Embedding dimension for CLIP teacher model. Defaults to 512.
60
+ clip_teacher_final_dim (int, optional): Final embedding dimension for CLIP teacher model. Defaults to 512.
61
+ clip_student_return_interval (int, optional): Interval for returning student layers. Defaults to 4.
62
+ clip_return_layer (int, optional): Number of layers to return for alignment. Defaults to 3.
63
+ clip_norm_type (str, optional): Normalization type for CLIP ('l2' or 'none'). Defaults to 'l2'.
64
+ sep_image_video_pos_embed (bool, optional): Whether to use separate position embeddings for image and video. Defaults to False.
65
+ **kwargs: Additional keyword arguments.
66
+ """
67
+ super().__init__(**kwargs)
68
+ self.img_size = img_size
69
+ self.patch_size = patch_size
70
+ self.tubelet_size = tubelet_size
71
+ self.num_frames = num_frames
72
+ self.d_model = d_model
73
+ self.num_heads = num_heads
74
+ self.depth = depth
75
+ self.mlp_ratio = mlp_ratio
76
+ self.qkv_bias = qkv_bias
77
+ self.init_values = init_values
78
+ self.use_checkpoint = use_checkpoint
79
+ self.checkpoint_num = checkpoint_num
80
+ self.use_flash_attn = use_flash_attn
81
+ self.use_fused_mlp = use_fused_mlp
82
+ self.use_fused_rmsnorm = use_fused_rmsnorm
83
+ self.qk_normalization = qk_normalization
84
+ self.clip_embed_dim = clip_embed_dim
85
+ self.attn_pool_num_heads = attn_pool_num_heads
86
+ self.clip_teacher_embed_dim = clip_teacher_embed_dim
87
+ self.clip_teacher_final_dim = clip_teacher_final_dim
88
+ self.clip_student_return_interval = clip_student_return_interval
89
+ self.clip_return_layer = clip_return_layer
90
+ self.clip_norm_type = clip_norm_type
91
+ self.sep_image_video_pos_embed = sep_image_video_pos_embed
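For reference, this configuration class can also be constructed directly and round-tripped through save_pretrained/from_pretrained. A small sketch, assuming it is run from inside vision_tower/internvideo2/ so the module imports by its file name; the overrides mirror the shipped config.json:

from configuration_internvideo2 import InternVideo2Config

cfg = InternVideo2Config(num_frames=4, sep_image_video_pos_embed=True,
                         use_checkpoint=True, checkpoint_num=40)
assert cfg.d_model == 1408 and cfg.patch_size == 14   # class defaults

# Patch tokens per clip: (num_frames / tubelet_size) * (img_size / patch_size) ** 2
print((cfg.num_frames // cfg.tubelet_size) * (cfg.img_size // cfg.patch_size) ** 2)  # 4 * 16 * 16 = 1024

cfg.save_pretrained("/tmp/internvideo2_cfg")           # writes a config.json like the one above
reloaded = InternVideo2Config.from_pretrained("/tmp/internvideo2_cfg")
assert reloaded.num_frames == 4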
vision_tower/internvideo2/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc98ff193aca2992ed2f56fea01c4d7be2e2c737cf9fcc5a73ef31663e728624
3
+ size 2098289968
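This file is a Git LFS pointer; after git lfs pull the real ~2.1 GB weights replace it. A sketch for inspecting or loading them directly with the safetensors library (the path assumes the repository root as working directory):

from safetensors import safe_open
from safetensors.torch import load_file

path = "vision_tower/internvideo2/model.safetensors"  # assumes `git lfs pull` has replaced the pointer

# Peek at tensor names without loading everything into memory
with safe_open(path, framework="pt") as f:
    keys = list(f.keys())
print(len(keys), keys[:3])

state_dict = load_file(path)            # full load as a dict of torch tensors
print(state_dict[keys[0]].shape)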
vision_tower/internvideo2/modeling_internvideo2.py ADDED
@@ -0,0 +1,934 @@
1
+ # modeling_internvideo2.py
2
+
3
+ import logging
4
+ import math
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from transformers import PreTrainedModel
11
+ from transformers.utils import logging as hf_logging
12
+
13
+ from torch.utils.checkpoint import checkpoint  # used by Block when gradient checkpointing (with_cp) is enabled
14
+
15
+ from functools import partial
16
+
17
+ from .configuration_internvideo2 import InternVideo2Config # Import the configuration
18
+
19
+ try:
20
+ from einops import rearrange
21
+ except ImportError:
22
+ raise ImportError("Please install einops to use this model.")
23
+
24
+ try:
25
+ from timm.models.layers import DropPath, to_2tuple
26
+ except ImportError:
27
+ raise ImportError("Please install timm to use this model.")
28
+
29
+ logger = hf_logging.get_logger(__name__)
30
+
31
+ # Position embedding functions
32
+ def get_3d_sincos_pos_embed(embed_dim, grid_size, t_size, cls_token=False):
33
+ assert embed_dim % 4 == 0
34
+ embed_dim_spatial = embed_dim // 4 * 3
35
+ embed_dim_temporal = embed_dim // 4
36
+
37
+ # Spatial
38
+ grid_h = np.arange(grid_size, dtype=np.float32)
39
+ grid_w = np.arange(grid_size, dtype=np.float32)
40
+ grid = np.meshgrid(grid_w, grid_h) # W first
41
+ grid = np.stack(grid, axis=0)
42
+
43
+ grid = grid.reshape([2, 1, grid_size, grid_size])
44
+ pos_embed_spatial = get_2d_sincos_pos_embed_from_grid(embed_dim_spatial, grid)
45
+
46
+ # Temporal
47
+ grid_t = np.arange(t_size, dtype=np.float32)
48
+ pos_embed_temporal = get_1d_sincos_pos_embed_from_grid(embed_dim_temporal, grid_t)
49
+
50
+ # Combine spatial and temporal embeddings
51
+ pos_embed_temporal = pos_embed_temporal[:, np.newaxis, :]
52
+ pos_embed_temporal = np.repeat(pos_embed_temporal, grid_size**2, axis=1)
53
+ pos_embed_spatial = pos_embed_spatial[np.newaxis, :, :]
54
+ pos_embed_spatial = np.repeat(pos_embed_spatial, t_size, axis=0)
55
+
56
+ pos_embed = np.concatenate([pos_embed_temporal, pos_embed_spatial], axis=-1)
57
+ pos_embed = pos_embed.reshape([-1, embed_dim])
58
+
59
+ if cls_token:
60
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
61
+ return pos_embed
62
+
63
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
64
+ assert embed_dim % 2 == 0
65
+
66
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])
67
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])
68
+
69
+ emb = np.concatenate([emb_h, emb_w], axis=1)
70
+ return emb
71
+
72
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
73
+ assert embed_dim % 2 == 0
74
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
75
+ omega /= embed_dim / 2.0
76
+ omega = 1.0 / (10000 ** omega)
77
+
78
+ pos = pos.reshape(-1)
79
+ out = np.einsum('m,d->md', pos, omega)
80
+
81
+ emb_sin = np.sin(out)
82
+ emb_cos = np.cos(out)
83
+
84
+ emb = np.concatenate([emb_sin, emb_cos], axis=1)
85
+ return emb
86
+
87
+ # Define necessary classes: CrossAttention, AttentiveBlock, AttentionPoolingBlock, RMSNorm, LayerScale, Attention, Mlp, Block, PatchEmbed, Linear_Decoder
88
+
89
+
90
+ class CrossAttention(nn.Module):
91
+ def __init__(
92
+ self,
93
+ dim,
94
+ num_heads=8,
95
+ qkv_bias=False,
96
+ qk_scale=None,
97
+ attn_drop=0.0,
98
+ proj_drop=0.0,
99
+ attn_head_dim=None,
100
+ out_dim=None,
101
+ ):
102
+ super().__init__()
103
+ if out_dim is None:
104
+ out_dim = dim
105
+ self.num_heads = num_heads
106
+ head_dim = dim // num_heads
107
+ if attn_head_dim is not None:
108
+ head_dim = attn_head_dim
109
+ all_head_dim = head_dim * self.num_heads
110
+ self.scale = qk_scale or head_dim ** -0.5
111
+ assert all_head_dim == dim
112
+
113
+ self.q = nn.Linear(dim, all_head_dim, bias=False)
114
+ self.k = nn.Linear(dim, all_head_dim, bias=False)
115
+ self.v = nn.Linear(dim, all_head_dim, bias=False)
116
+
117
+ if qkv_bias:
118
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
119
+ self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
120
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
121
+ else:
122
+ self.q_bias = None
123
+ self.k_bias = None
124
+ self.v_bias = None
125
+
126
+ self.attn_drop = nn.Dropout(attn_drop)
127
+ self.proj = nn.Linear(all_head_dim, out_dim)
128
+ self.proj_drop = nn.Dropout(proj_drop)
129
+
130
+ def forward(self, x, k=None, v=None):
131
+ B, N, C = x.shape
132
+ N_k = k.shape[1]
133
+ N_v = v.shape[1]
134
+
135
+ q_bias, k_bias, v_bias = None, None, None
136
+ if self.q_bias is not None:
137
+ q_bias = self.q_bias
138
+ k_bias = self.k_bias
139
+ v_bias = self.v_bias
140
+
141
+ q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
142
+ q = (
143
+ q.reshape(B, N, 1, self.num_heads, -1)
144
+ .permute(2, 0, 3, 1, 4)
145
+ .squeeze(0)
146
+ ) # (B, N_head, N_q, dim)
147
+
148
+ k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
149
+ k = (
150
+ k.reshape(B, N_k, 1, self.num_heads, -1)
151
+ .permute(2, 0, 3, 1, 4)
152
+ .squeeze(0)
153
+ )
154
+
155
+ v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
156
+ v = (
157
+ v.reshape(B, N_v, 1, self.num_heads, -1)
158
+ .permute(2, 0, 3, 1, 4)
159
+ .squeeze(0)
160
+ )
161
+
162
+ q = q * self.scale
163
+ attn = q @ k.transpose(-2, -1) # (B, N_head, N_q, N_k)
164
+
165
+ attn = attn.softmax(dim=-1)
166
+ attn = self.attn_drop(attn)
167
+
168
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
169
+ x = self.proj(x)
170
+ x = self.proj_drop(x)
171
+
172
+ return x
173
+
174
+
175
+ class AttentiveBlock(nn.Module):
176
+ def __init__(
177
+ self,
178
+ dim,
179
+ num_heads,
180
+ qkv_bias=False,
181
+ qk_scale=None,
182
+ drop=0.0,
183
+ attn_drop=0.0,
184
+ drop_path=0.0,
185
+ norm_layer=nn.LayerNorm,
186
+ attn_head_dim=None,
187
+ out_dim=None,
188
+ ):
189
+ super().__init__()
190
+
191
+ self.norm1_q = norm_layer(dim)
192
+ self.norm1_k = norm_layer(dim)
193
+ self.norm1_v = norm_layer(dim)
194
+ self.cross_attn = CrossAttention(
195
+ dim,
196
+ num_heads=num_heads,
197
+ qkv_bias=qkv_bias,
198
+ qk_scale=qk_scale,
199
+ attn_drop=attn_drop,
200
+ proj_drop=drop,
201
+ attn_head_dim=attn_head_dim,
202
+ out_dim=out_dim,
203
+ )
204
+
205
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
206
+
207
+ def forward(
208
+ self, x_q, x_kv, pos_q, pos_k, bool_masked_pos, rel_pos_bias=None
209
+ ):
210
+ x_q = self.norm1_q(x_q + pos_q)
211
+ x_k = self.norm1_k(x_kv + pos_k)
212
+ x_v = self.norm1_v(x_kv)
213
+ x = self.cross_attn(x_q, k=x_k, v=x_v)
214
+
215
+ return x
216
+
217
+
218
+ class AttentionPoolingBlock(AttentiveBlock):
219
+ def forward(self, x):
220
+ x_q = x.mean(1, keepdim=True)
221
+ x_kv, pos_q, pos_k = x, 0, 0
222
+ x = super().forward(
223
+ x_q, x_kv, pos_q, pos_k, bool_masked_pos=None, rel_pos_bias=None
224
+ )
225
+ x = x.squeeze(1)
226
+ return x
227
+
228
+
229
+ class RMSNorm(nn.Module):
230
+ def __init__(self, hidden_size, eps=1e-6):
231
+ super().__init__()
232
+ self.weight = nn.Parameter(torch.ones(hidden_size))
233
+ self.variance_epsilon = eps
234
+
235
+ def forward(self, hidden_states):
236
+ input_dtype = hidden_states.dtype
237
+ hidden_states = hidden_states.to(torch.float32)
238
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
239
+ hidden_states = hidden_states * torch.rsqrt(
240
+ variance + self.variance_epsilon
241
+ )
242
+ return self.weight * hidden_states.to(input_dtype)
243
+
244
+
245
+ class LayerScale(nn.Module):
246
+ def __init__(
247
+ self, dim, init_values=1e-5, inplace=False, force_fp32=False
248
+ ):
249
+ super().__init__()
250
+ self.inplace = inplace
251
+ self.weight = nn.Parameter(init_values * torch.ones(dim))
252
+ self.force_fp32 = force_fp32
253
+
254
+ @torch.cuda.amp.autocast(enabled=False)
255
+ def forward(self, x):
256
+ if self.force_fp32:
257
+ output_type = x.dtype
258
+ out = (
259
+ x.float().mul_(self.weight.float())
260
+ if self.inplace
261
+ else x.float() * self.weight.float()
262
+ )
263
+ return out.to(dtype=output_type)
264
+ else:
265
+ out = x.mul_(self.weight) if self.inplace else x * self.weight
266
+ return out
267
+
268
+
269
+ class Attention(nn.Module):
270
+ def __init__(
271
+ self,
272
+ dim,
273
+ num_heads=8,
274
+ qkv_bias=False,
275
+ attn_drop=0.0,
276
+ proj_drop=0.0,
277
+ use_flash_attn=False,
278
+ causal=False,
279
+ norm_layer=nn.LayerNorm,
280
+ qk_normalization=False,
281
+ use_fused_rmsnorm=False,
282
+ ):
283
+ super().__init__()
284
+ assert (
285
+ dim % num_heads == 0
286
+ ), "dim should be divisible by num_heads"
287
+ self.num_heads = num_heads
288
+ head_dim = dim // num_heads
289
+ self.scale = head_dim ** -0.5
290
+
291
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
292
+ self.attn_drop = nn.Dropout(attn_drop)
293
+ self.proj = nn.Linear(dim, dim)
294
+ self.proj_drop = nn.Dropout(proj_drop)
295
+
296
+ self.use_flash_attn = use_flash_attn
297
+ if use_flash_attn:
298
+ self.causal = causal
299
+ try:
300
+ from flash_attn.flash_attention import FlashAttention
301
+
302
+ self.inner_attn = FlashAttention(
303
+ attention_dropout=attn_drop
304
+ )
305
+ except ImportError:
306
+ raise ImportError(
307
+ "Please install flash_attn to use flash attention."
308
+ )
309
+
310
+ self.qk_normalization = qk_normalization
311
+ self.q_norm = norm_layer(dim) if qk_normalization else nn.Identity()
312
+ self.k_norm = norm_layer(dim) if qk_normalization else nn.Identity()
313
+ self.use_fused_rmsnorm = use_fused_rmsnorm
314
+
315
+ def _naive_attn(self, x):
316
+ B, N, C = x.shape
317
+ # print(x.shape, torch.cuda.memory_allocated(), torch.cuda.memory_allocated())
318
+ qkv = (
319
+ self.qkv(x)
320
+ .reshape(B, N, 3, self.num_heads, C // self.num_heads)
321
+ .permute(2, 0, 3, 1, 4)
322
+ )
323
+ q, k, v = qkv.unbind(
324
+ 0
325
+ ) # make torchscript happy (cannot use tensor as tuple)
326
+
327
+ if self.qk_normalization:
328
+ B_, H_, N_, D_ = q.shape
329
+ q = (
330
+ self.q_norm(q.transpose(1, 2).flatten(-2, -1))
331
+ .view(B_, N_, H_, D_)
332
+ .transpose(1, 2)
333
+ )
334
+ k = (
335
+ self.k_norm(k.transpose(1, 2).flatten(-2, -1))
336
+ .view(B_, N_, H_, D_)
337
+ .transpose(1, 2)
338
+ )
339
+
340
+ attn = (q * self.scale) @ k.transpose(-2, -1)
341
+ # attn = attn - attn.max(-1)[0].unsqueeze(-1) # in case of overflow for fp16
342
+ attn = attn.softmax(dim=-1)
343
+ attn = self.attn_drop(attn)
344
+ # print(torch.cuda.memory_allocated(), torch.cuda.memory_allocated())
345
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
346
+ x = self.proj(x)
347
+ x = self.proj_drop(x)
348
+ return x
349
+
350
+ def _flash_attn(
351
+ self, x, key_padding_mask=None, need_weights=False
352
+ ):
353
+ qkv = self.qkv(x)
354
+ qkv = rearrange(
355
+ qkv, "b s (three h d) -> b s three h d", three=3, h=self.num_heads
356
+ )
357
+
358
+ if self.qk_normalization:
359
+ q, k, v = qkv.unbind(2)
360
+ if self.use_fused_rmsnorm:
361
+ q = self.q_norm(q.flatten(-2, -1))[0].view(q.shape)
362
+ k = self.k_norm(k.flatten(-2, -1))[0].view(k.shape)
363
+ else:
364
+ q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
365
+ k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
366
+ qkv = torch.stack([q, k, v], dim=2)
367
+
368
+ context, _ = self.inner_attn(
369
+ qkv,
370
+ key_padding_mask=key_padding_mask,
371
+ need_weights=need_weights,
372
+ causal=self.causal,
373
+ )
374
+ outs = self.proj(rearrange(context, "b s h d -> b s (h d)"))
375
+ outs = self.proj_drop(outs)
376
+ return outs
377
+
378
+ def forward(self, x):
379
+ x = (
380
+ self._naive_attn(x)
381
+ if not self.use_flash_attn
382
+ else self._flash_attn(x)
383
+ )
384
+ return x
385
+
386
+
387
+ class Mlp(nn.Module):
388
+ """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
389
+
390
+ def __init__(
391
+ self,
392
+ in_features,
393
+ hidden_features=None,
394
+ out_features=None,
395
+ act_layer=nn.GELU,
396
+ bias=True,
397
+ drop=0.0,
398
+ ):
399
+ super().__init__()
400
+ out_features = out_features or in_features
401
+ hidden_features = hidden_features or in_features
402
+ bias = to_2tuple(bias)
403
+ drop_probs = to_2tuple(drop)
404
+
405
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0])
406
+ self.act = act_layer()
407
+ self.drop1 = nn.Dropout(drop_probs[0])
408
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1])
409
+ self.drop2 = nn.Dropout(drop_probs[1])
410
+
411
+ def forward(self, x):
412
+ x = self.fc1(x)
413
+ x = self.act(x)
414
+ x = self.drop1(x)
415
+ x = self.fc2(x)
416
+ x = self.drop2(x)
417
+ return x
418
+
419
+
420
+ class Block(nn.Module):
421
+ def __init__(
422
+ self,
423
+ dim,
424
+ num_heads,
425
+ mlp_ratio=4.0,
426
+ qkv_bias=False,
427
+ drop=0.0,
428
+ attn_drop=0.0,
429
+ init_values=None,
430
+ drop_path=0.0,
431
+ act_layer=nn.GELU,
432
+ norm_layer=nn.LayerNorm,
433
+ use_flash_attn=False,
434
+ use_fused_mlp=False,
435
+ fused_mlp_heuristic=1,
436
+ with_cp=False,
437
+ qk_normalization=False,
438
+ layerscale_no_force_fp32=False,
439
+ use_fused_rmsnorm=False,
440
+ ):
441
+ super().__init__()
442
+
443
+ self.norm1 = norm_layer(dim)
444
+ self.attn = Attention(
445
+ dim,
446
+ num_heads=num_heads,
447
+ qkv_bias=qkv_bias,
448
+ attn_drop=attn_drop,
449
+ proj_drop=drop,
450
+ use_flash_attn=use_flash_attn,
451
+ causal=False,
452
+ norm_layer=norm_layer,
453
+ qk_normalization=qk_normalization,
454
+ use_fused_rmsnorm=use_fused_rmsnorm,
455
+ )
456
+ self.ls1 = (
457
+ LayerScale(
458
+ dim,
459
+ init_values=init_values,
460
+ force_fp32=(not layerscale_no_force_fp32),
461
+ )
462
+ if init_values
463
+ else nn.Identity()
464
+ )
465
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
466
+ self.drop_path1 = (
467
+ DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
468
+ )
469
+
470
+ self.norm2 = norm_layer(dim)
471
+ mlp_hidden_dim = int(dim * mlp_ratio)
472
+ if use_fused_mlp:
473
+ try:
474
+ from flash_attn.modules.mlp import FusedMLP
475
+ except ImportError:
476
+ raise ImportError(
477
+ "Please install flash_attn to use fused MLP."
478
+ )
479
+ self.mlp = FusedMLP(
480
+ in_features=dim,
481
+ hidden_features=mlp_hidden_dim,
482
+ heuristic=fused_mlp_heuristic,
483
+ )
484
+ else:
485
+ self.mlp = Mlp(
486
+ in_features=dim,
487
+ hidden_features=mlp_hidden_dim,
488
+ act_layer=act_layer,
489
+ drop=drop,
490
+ )
491
+ self.ls2 = (
492
+ LayerScale(
493
+ dim,
494
+ init_values=init_values,
495
+ force_fp32=(not layerscale_no_force_fp32),
496
+ )
497
+ if init_values
498
+ else nn.Identity()
499
+ )
500
+ self.drop_path2 = (
501
+ DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
502
+ )
503
+
504
+ self.with_cp = with_cp
505
+ self.use_fused_rmsnorm = use_fused_rmsnorm
506
+
507
+ def forward(self, x, residual=None):
508
+ def _inner_forward(x, residual=None):
509
+ if self.use_fused_rmsnorm:
510
+ x, residual = self.norm1(x, residual)
511
+ x = self.drop_path1(self.ls1(self.attn(x)))
512
+ x, residual = self.norm2(x, residual)
513
+ x = self.drop_path2(self.ls2(self.mlp(x)))
514
+ return x, residual
515
+ else:
516
+ assert residual is None
517
+ x = x + self.drop_path1(
518
+ self.ls1(self.attn(self.norm1(x)))
519
+ )
520
+ x = x + self.drop_path2(
521
+ self.ls2(self.mlp(self.norm2(x)))
522
+ )
523
+ return x
524
+
525
+ if self.with_cp:
526
+ return checkpoint(_inner_forward, x, residual)
527
+ else:
528
+ return _inner_forward(x, residual=residual)
529
+
530
+
531
+ class PatchEmbed(nn.Module):
532
+ """3D Image to Patch Embedding"""
533
+
534
+ def __init__(
535
+ self,
536
+ img_size=224,
537
+ patch_size=16,
538
+ in_chans=3,
539
+ embed_dim=768,
540
+ num_frames=8,
541
+ tubelet_size=1,
542
+ norm_layer=None,
543
+ ):
544
+ super().__init__()
545
+ img_size = to_2tuple(img_size)
546
+ patch_size = to_2tuple(patch_size)
547
+ self.img_size = img_size
548
+ self.patch_size = patch_size
549
+ self.grid_size = (
550
+ num_frames // tubelet_size,
551
+ img_size[0] // patch_size[0],
552
+ img_size[1] // patch_size[1],
553
+ ) # (T, H, W)
554
+ self.num_patches = (
555
+ self.grid_size[0] * self.grid_size[1] * self.grid_size[2]
556
+ )
557
+ self.num_img_patches = self.grid_size[1] * self.grid_size[2]
558
+
559
+ self.proj = nn.Conv3d(
560
+ in_channels=in_chans,
561
+ out_channels=embed_dim,
562
+ kernel_size=(tubelet_size, patch_size[0], patch_size[1]),
563
+ stride=(tubelet_size, patch_size[0], patch_size[1]),
564
+ )
565
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
566
+
567
+ def forward(self, x):
568
+ x = self.proj(x)
569
+ x = (
570
+ x.flatten(3)
571
+ .permute(0, 2, 3, 1)
572
+ ) # B x C x T x HW => B x T x HW x C
573
+ x = self.norm(x)
574
+ return x
575
+
576
+
577
+
578
+ class Linear_Decoder(nn.Module):
579
+ def __init__(self, in_channels=1408, out_channels=3200, norm_layer=nn.LayerNorm, clip_norm_type='l2'):
580
+ super().__init__()
581
+ self.clip_norm_type = clip_norm_type
582
+ logger.info(f'Normalization Type: {clip_norm_type}')
583
+
584
+ self.head = nn.Linear(in_channels, out_channels)
585
+ self.norm = norm_layer(out_channels)
586
+
587
+ def forward(self, x):
588
+ x = self.norm(self.head(x))
589
+
590
+ if self.clip_norm_type == 'l2':
591
+ x = x / x.norm(dim=-1, keepdim=True)
592
+ elif self.clip_norm_type == 'none':
593
+ pass
594
+ else:
595
+ raise NotImplementedError
596
+
597
+ return x
598
+
599
+ class InternVideo2Model(PreTrainedModel):
600
+ config_class = InternVideo2Config
601
+ base_model_prefix = "internvideo2"
602
+
603
+ def __init__(self, config: InternVideo2Config):
604
+ super().__init__(config)
605
+
606
+ in_chans = 3
607
+ drop_path_rate = 0.25
608
+ qk_normalization = config.qk_normalization
609
+ clip_embed_dim = config.clip_embed_dim
610
+ num_heads = config.num_heads
611
+ qkv_bias = config.qkv_bias
612
+ init_values = config.init_values
613
+ mlp_ratio = config.mlp_ratio
614
+ depth = config.depth
615
+ num_frames = config.num_frames
616
+ self.num_frames = num_frames
617
+ self.tubelet_size = config.tubelet_size
618
+ use_fused_mlp = config.use_fused_mlp
619
+ use_fused_rmsnorm = config.use_fused_rmsnorm
620
+ use_flash_attn = config.use_flash_attn
621
+ assert (
622
+ use_flash_attn
623
+ == use_fused_rmsnorm
624
+ == use_fused_mlp
625
+ ), "use_flash_attn, use_fused_rmsnorm and use_fused_mlp should be consistent"
626
+
627
+ self.use_flash_attn = use_flash_attn
628
+ embed_dim = config.d_model
629
+ self.embed_dim = embed_dim
630
+
631
+ self.depth = depth
632
+ self.clip_norm_type = config.clip_norm_type
633
+ self.return_index = []
634
+ for i in range(config.clip_return_layer):
635
+ self.return_index.append(
636
+ depth - int(i * config.clip_student_return_interval) - 1
637
+ )
638
+ logger.info(f"Normalization Type: {config.clip_norm_type}")
639
+ logger.info(f"Student Return Index: {self.return_index}")
640
+
641
+ if use_fused_rmsnorm:
642
+ try:
643
+ from flash_attn.ops.rms_norm import DropoutAddRMSNorm
644
+ except ImportError:
645
+ raise ImportError(
646
+ "Please install flash_attn to use fused RMSNorm."
647
+ )
648
+ norm_layer_for_blocks = partial(
649
+ DropoutAddRMSNorm, eps=1e-6, prenorm=True
650
+ )
651
+ else:
652
+ norm_layer_for_blocks = partial(RMSNorm, eps=1e-6)
653
+ self.norm_layer_for_blocks = norm_layer_for_blocks
654
+ self.patch_embed = PatchEmbed(
655
+ config.img_size,
656
+ config.patch_size,
657
+ in_chans,
658
+ embed_dim,
659
+ num_frames=num_frames,
660
+ tubelet_size=self.tubelet_size,
661
+ )
662
+ num_patches = self.patch_embed.num_patches
663
+ num_img_patches = self.patch_embed.num_img_patches
664
+
665
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
666
+
667
+ self.sep_pos_embed = False
668
+ self.sep_image_video_pos_embed = config.sep_image_video_pos_embed
669
+ if self.sep_pos_embed:
670
+ raise NotImplementedError
671
+ else:
672
+ if self.sep_image_video_pos_embed:
673
+ logger.info(
674
+ "Use joint position embedding, for image and video we use different pos_embed."
675
+ )
676
+ self.pos_embed = nn.Parameter(
677
+ torch.zeros(1, num_patches + 1, embed_dim)
678
+ )
679
+ self.img_pos_embed = nn.Parameter(
680
+ torch.zeros(1, num_img_patches + 1, embed_dim)
681
+ )
682
+ # for CLIP decoder
683
+ self.clip_pos_embed = nn.Parameter(
684
+ torch.zeros(1, num_patches + 1, embed_dim)
685
+ )
686
+ self.clip_img_pos_embed = nn.Parameter(
687
+ torch.zeros(1, num_img_patches + 1, embed_dim)
688
+ )
689
+ else:
690
+ logger.info(
691
+ "Use joint position embedding, for image and video we use same pos_embed."
692
+ )
693
+ self.pos_embed = nn.Parameter(
694
+ torch.zeros(1, num_patches + 1, embed_dim)
695
+ )
696
+ self.clip_pos_embed = nn.Parameter(
697
+ torch.zeros(1, num_patches + 1, embed_dim)
698
+ )
699
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
700
+ # choose which layer to use checkpoint
701
+ with_cp_list = [False] * depth
702
+ if config.use_checkpoint:
703
+ for idx in range(depth):
704
+ if idx < config.checkpoint_num:
705
+ with_cp_list[idx] = True
706
+ logger.info(f"Droppath rate: {dpr}")
707
+ logger.info(f"Checkpoint list: {with_cp_list}")
708
+
709
+ self.blocks = nn.ModuleList(
710
+ [
711
+ Block(
712
+ embed_dim,
713
+ num_heads,
714
+ mlp_ratio,
715
+ qkv_bias=qkv_bias,
716
+ norm_layer=norm_layer_for_blocks,
717
+ drop_path=dpr[i],
718
+ init_values=init_values,
719
+ attn_drop=0.0,
720
+ use_flash_attn=use_flash_attn,
721
+ use_fused_mlp=use_fused_mlp,
722
+ fused_mlp_heuristic=1,
723
+ with_cp=with_cp_list[i],
724
+ qk_normalization=qk_normalization,
725
+ layerscale_no_force_fp32=False,
726
+ use_fused_rmsnorm=use_fused_rmsnorm,
727
+ )
728
+ for i in range(depth)
729
+ ]
730
+ )
731
+ self.clip_projector = AttentionPoolingBlock(
732
+ dim=embed_dim,
733
+ num_heads=config.attn_pool_num_heads,
734
+ qkv_bias=True,
735
+ qk_scale=None,
736
+ drop=0.0,
737
+ attn_drop=0.0,
738
+ norm_layer=partial(nn.LayerNorm, eps=1e-5),
739
+ out_dim=clip_embed_dim,
740
+ )
741
+
742
+ # CLIP decoder
743
+ self.clip_decoder = nn.ModuleList(
744
+ [
745
+ Linear_Decoder(
746
+ in_channels=embed_dim,
747
+ out_channels=config.clip_teacher_embed_dim,
748
+ norm_layer=partial(nn.LayerNorm, eps=1e-5),
749
+ clip_norm_type=config.clip_norm_type,
750
+ )
751
+ for _ in range(config.clip_return_layer)
752
+ ]
753
+ )
754
+ self.final_clip_decoder = nn.Identity()
755
+ if config.clip_teacher_final_dim > 0:
756
+ self.final_clip_decoder = Linear_Decoder(
757
+ in_channels=clip_embed_dim,
758
+ out_channels=config.clip_teacher_final_dim,
759
+ norm_layer=partial(nn.LayerNorm, eps=1e-5),
760
+ clip_norm_type=config.clip_norm_type,
761
+ )
762
+
763
+ # Removed initialization methods and code
764
+
765
+ @property
766
+ def dtype(self):
767
+ return self.patch_embed.proj.weight.dtype
768
+
769
+ def get_num_layers(self):
770
+ return len(self.blocks)
771
+
772
+ @torch.jit.ignore
773
+ def no_weight_decay(self):
774
+ return {
775
+ "pos_embed",
776
+ "pos_embed_spatial",
777
+ "pos_embed_temporal",
778
+ "pos_embed_cls",
779
+ "img_pos_embed",
780
+ "cls_token",
781
+ "clip_pos_embed",
782
+ "clip_pos_embed_spatial",
783
+ "clip_pos_embed_temporal",
784
+ "clip_pos_embed_cls",
785
+ "clip_img_pos_embed",
786
+ }
787
+
788
+ def forward(
789
+ self,
790
+ x,
791
+ mask=None,
792
+ use_image=False,
793
+ x_vis_return_idx=-1,
794
+ x_vis_only=False,
795
+ ):
796
+ x = self.patch_embed(x.type(self.dtype))
797
+ B, T, L, C = x.shape
798
+ x = x.view([B, T * L, C])
799
+
800
+ # Append cls token
801
+ cls_tokens = self.cls_token.expand(B, -1, -1)
802
+ x = torch.cat((cls_tokens, x), dim=1)
803
+
804
+ # Add positional embeddings
805
+ if self.sep_pos_embed:
806
+ raise NotImplementedError
807
+ else:
808
+ if use_image:
809
+ if self.sep_image_video_pos_embed:
810
+ pos_embed = self.img_pos_embed
811
+ else:
812
+ cls_pos_embed = self.pos_embed[:, 0:1, :]
813
+ img_pos_embed = (
814
+ self.pos_embed[:, 1:, :]
815
+ .view(
816
+ 1,
817
+ self.num_frames,
818
+ self.patch_embed.num_patches // self.num_frames,
819
+ self.embed_dim,
820
+ )
821
+ .mean(dim=1)
822
+ )
823
+ pos_embed = torch.cat(
824
+ [cls_pos_embed, img_pos_embed], dim=1
825
+ )
826
+ else:
827
+ pos_embed = self.pos_embed
828
+ x = x + pos_embed
829
+
830
+ # Mask tokens
831
+ if mask is not None:
832
+ x = x[~mask].reshape(B, -1, C)
833
+ else:
834
+ x = x.reshape(B, -1, C)
835
+
836
+ residual = None
837
+ x_clip = []
838
+ for idx, blk in enumerate(self.blocks):
839
+ if isinstance(x, tuple) and len(x) == 2:
840
+ x, residual = x
841
+ x = blk(x, residual=residual)
842
+ # Return intermediate features
843
+ if idx in self.return_index:
844
+ if isinstance(x, tuple) and len(x) == 2:
845
+ tmp_x, tmp_residual = x
846
+ if residual is not None:
847
+ x_clip.append(tmp_x + tmp_residual)
848
+ else:
849
+ x_clip.append(x)
850
+ if idx == (self.depth + x_vis_return_idx):
851
+ break
852
+
853
+ if isinstance(x, tuple) and len(x) == 2:
854
+ x, residual = x
855
+ if residual is not None:
856
+ x = x + residual
857
+
858
+ x_vis = x
859
+ if x_vis_only:
860
+ return x_vis
861
+
862
+ x_pool_vis = self.clip_projector(x_vis)
863
+ x_align = self.final_clip_decoder(x_pool_vis)
864
+
865
+ # Align CLIP
866
+ x_clip = torch.stack(x_clip)
867
+ K, B, _, C_CLIP = x_clip.shape
868
+ # Add positional embeddings
869
+ if self.sep_pos_embed:
870
+ raise NotImplementedError
871
+ else:
872
+ if use_image:
873
+ if self.sep_image_video_pos_embed:
874
+ clip_pos_embed = self.clip_img_pos_embed
875
+ else:
876
+ clip_cls_pos_embed = self.clip_pos_embed[:, 0:1, :]
877
+ clip_img_pos_embed = (
878
+ self.clip_pos_embed[:, 1:, :]
879
+ .view(
880
+ 1,
881
+ self.num_frames,
882
+ self.patch_embed.num_patches // self.num_frames,
883
+ self.embed_dim,
884
+ )
885
+ .mean(dim=1)
886
+ )
887
+ clip_pos_embed = torch.cat(
888
+ [clip_cls_pos_embed, clip_img_pos_embed], dim=1
889
+ )
890
+
891
+ else:
892
+ clip_pos_embed = self.clip_pos_embed
893
+
894
+ clip_pos_embed = clip_pos_embed.repeat(B, 1, 1)
895
+ if mask is not None:
896
+ x_clip = x_clip + clip_pos_embed[~mask].view(
897
+ B, -1, C_CLIP
898
+ ).unsqueeze(0).repeat(K, 1, 1, 1)
899
+ else:
900
+ x_clip = x_clip + clip_pos_embed.view(B, -1, C_CLIP).unsqueeze(
901
+ 0
902
+ ).repeat(K, 1, 1, 1)
903
+
904
+ # CLIP decoder
905
+ x_clip_align = []
906
+ for idx, clip_decoder in enumerate(self.clip_decoder):
907
+ x_clip_align.append(clip_decoder(x_clip[idx]))
908
+ x_clip_align = torch.stack(x_clip_align)
909
+
910
+ return x_vis, x_pool_vis, x_clip_align, x_align
911
+
912
+
913
+ def load_pretrained_weights(self):
914
+ if self.config.pretrained is not None:
915
+ logger.info(f"Loading pretrained weights from {self.config.pretrained}")
916
+ state_dict = torch.load(self.config.pretrained, map_location='cpu')
917
+
918
+ # Map legacy LayerScale keys '.ls1.gamma'/'.ls2.gamma' (original InternVideo2 naming) onto this module's '.ls1.weight'/'.ls2.weight'
919
+ new_state_dict = {}
920
+ for key, value in state_dict.items():
921
+ if key.endswith('.ls1.gamma'):
922
+ new_key = key.replace('.ls1.gamma', '.ls1.weight')
923
+ new_state_dict[new_key] = value
924
+ elif key.endswith('.ls2.gamma'):
925
+ new_key = key.replace('.ls2.gamma', '.ls2.weight')
926
+ new_state_dict[new_key] = value
927
+ else:
928
+ new_state_dict[key] = value
929
+
930
+ # Load the adjusted state_dict
931
+ message = self.load_state_dict(new_state_dict, strict=False)
932
+ logger.info(message)
933
+ else:
934
+ logger.info("No pretrained weights provided.")
vision_tower/internvideo2/preprocessor_config.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "Resize": {
3
+ "size": 224,
4
+ "interpolation": "bilinear"
5
+ },
6
+ "CenterCrop": {
7
+ "size": [
8
+ 224,
9
+ 224
10
+ ]
11
+ },
12
+ "ClipToTensor": {
13
+ "channel_nb": 3,
14
+ "div_255": true,
15
+ "numpy": false
16
+ },
17
+ "Normalize": {
18
+ "mean": [
19
+ 0.485,
20
+ 0.456,
21
+ 0.406
22
+ ],
23
+ "std": [
24
+ 0.229,
25
+ 0.224,
26
+ 0.225
27
+ ]
28
+ },
29
+ "image_processor_type": "transforms"
30
+ }
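Unlike the SigLIP processor further below, this file describes a torchvision-style video transform chain ("image_processor_type": "transforms"): Resize(224, bilinear) -> CenterCrop(224) -> ClipToTensor(div_255) -> Normalize(ImageNet mean/std). A hedged torchvision equivalent, where ClipToTensor is approximated by a manual /255 conversion on a uint8 clip of shape (T, H, W, C):

import torch
from torchvision import transforms

preprocess = transforms.Compose([
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BILINEAR),
    transforms.CenterCrop((224, 224)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

clip = torch.randint(0, 256, (4, 256, 320, 3), dtype=torch.uint8)  # dummy 4-frame clip (T, H, W, C)
clip = clip.permute(0, 3, 1, 2).float() / 255.0                    # (T, C, H, W) in [0, 1], stand-in for ClipToTensor
clip = preprocess(clip)                                            # resize, crop and normalize per frame
video = clip.permute(1, 0, 2, 3).unsqueeze(0)                      # (1, C, T, 224, 224) for the tower
print(video.shape)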
vision_tower/siglip-so400m-patch14-384/config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "_name_or_path": "/opt/hpcaas/.mounts/fs-0663e2d3c38211883/home/orrzohar/Artemis/work_dirs/final_run/apollo-Qwen2.5-7B-Instruct-internvideo2-siglip-so400m-patch14-384-freeze-perciver_128_2-newprompt-ft/checkpoint-13300/vision_tower/siglip-so400m-patch14-384",
3
+ "architectures": [
4
+ "SiglipVisionModel"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "hidden_act": "gelu_pytorch_tanh",
8
+ "hidden_size": 1152,
9
+ "image_size": 384,
10
+ "intermediate_size": 4304,
11
+ "layer_norm_eps": 1e-06,
12
+ "model_type": "siglip_vision_model",
13
+ "num_attention_heads": 16,
14
+ "num_channels": 3,
15
+ "num_hidden_layers": 27,
16
+ "patch_size": 14,
17
+ "torch_dtype": "bfloat16",
18
+ "transformers_version": "4.44.0"
19
+ }
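This is a stock SiglipVisionModel configuration (no custom code needed): hidden size 1152, 27 layers, 384-pixel inputs with 14-pixel patches, i.e. 27 x 27 = 729 patch tokens per image. A quick check, assuming the repository root as working directory:

from transformers import SiglipVisionConfig

path = "vision_tower/siglip-so400m-patch14-384"
cfg = SiglipVisionConfig.from_pretrained(path)
print(cfg.hidden_size, cfg.num_hidden_layers)     # 1152 27
print((cfg.image_size // cfg.patch_size) ** 2)    # 729 patch tokens per 384x384 image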
vision_tower/siglip-so400m-patch14-384/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e9299e05ae6cc8366374648c1a8d202d57baa136e117c2203a23d135dbb0707
3
+ size 856506120
vision_tower/siglip-so400m-patch14-384/preprocessor_config.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.5,
8
+ 0.5,
9
+ 0.5
10
+ ],
11
+ "image_processor_type": "SiglipImageProcessor",
12
+ "image_std": [
13
+ 0.5,
14
+ 0.5,
15
+ 0.5
16
+ ],
17
+ "processor_class": "SiglipProcessor",
18
+ "resample": 3,
19
+ "rescale_factor": 0.00392156862745098,
20
+ "size": {
21
+ "height": 384,
22
+ "width": 384
23
+ }
24
+ }
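End-to-end sketch for the SigLIP branch: the image processor resizes to 384x384, rescales by 1/255 and normalizes with mean/std 0.5 (mapping pixels to [-1, 1]), producing pixel_values for the vision model. It assumes the repository root as working directory and that the weights have been fetched with git lfs pull; the dummy image is illustrative only.

import torch
from PIL import Image
from transformers import SiglipImageProcessor, SiglipVisionModel

path = "vision_tower/siglip-so400m-patch14-384"
processor = SiglipImageProcessor.from_pretrained(path)
model = SiglipVisionModel.from_pretrained(path).eval()

image = Image.new("RGB", (640, 480), color=(128, 64, 32))   # dummy image
inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)                         # (1, 3, 384, 384)

with torch.no_grad():
    out = model(**inputs)
print(out.last_hidden_state.shape)                          # (1, 729, 1152)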