katuni4ka committed
Commit 50e8eb3
1 parent: e58d045

Upload 15 files

added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "<image>": 32000,
+   "<pad>": 32001
+ }
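These two entries extend the base 32000-entry Llama vocabulary, which is why "text_config.vocab_size" is 32002 in config.json below. A minimal sketch to confirm the mapping; the local path is an illustrative stand-in for a clone of this repo, with transformers installed:

from transformers import AutoTokenizer

# Hypothetical local checkout of this repository.
tok = AutoTokenizer.from_pretrained("./tiny-random-llava")
print(tok.convert_tokens_to_ids("<image>"))  # expected: 32000
print(tok.convert_tokens_to_ids("<pad>"))    # expected: 32001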
config.json ADDED
@@ -0,0 +1,175 @@
+ {
+   "_name_or_path": "trl-internal-testing/tiny-random-LlavaForConditionalGeneration",
+   "architectures": [
+     "LlavaForConditionalGeneration"
+   ],
+   "ignore_index": -100,
+   "image_seq_length": 576,
+   "image_token_index": 32000,
+   "model_type": "llava",
+   "projector_hidden_act": "gelu",
+   "text_config": {
+     "_name_or_path": "HuggingFaceM4/tiny-random-LlamaForCausalLM",
+     "add_cross_attention": false,
+     "architectures": [
+       "LlamaForCausalLM"
+     ],
+     "attention_bias": false,
+     "attention_dropout": 0.0,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": 0,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": 1,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "head_dim": 4,
+     "hidden_act": "silu",
+     "hidden_size": 16,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "initializer_range": 0.02,
+     "intermediate_size": 64,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "max_position_embeddings": 2048,
+     "min_length": 0,
+     "mlp_bias": false,
+     "model_type": "llama",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 4,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_hidden_layers": 2,
+     "num_key_value_heads": 4,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": 2,
+     "prefix": null,
+     "pretraining_tp": 1,
+     "problem_type": null,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "rms_norm_eps": 1e-06,
+     "rope_scaling": null,
+     "rope_theta": 10000.0,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": false,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": "float32",
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false,
+     "use_cache": true,
+     "vocab_size": 32002
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.45.2",
+   "vision_config": {
+     "_name_or_path": "",
+     "add_cross_attention": false,
+     "architectures": null,
+     "attention_dropout": 0.1,
+     "bad_words_ids": null,
+     "begin_suppress_tokens": null,
+     "bos_token_id": null,
+     "chunk_size_feed_forward": 0,
+     "cross_attention_hidden_size": null,
+     "decoder_start_token_id": null,
+     "diversity_penalty": 0.0,
+     "do_sample": false,
+     "dropout": 0.1,
+     "early_stopping": false,
+     "encoder_no_repeat_ngram_size": 0,
+     "eos_token_id": null,
+     "exponential_decay_length_penalty": null,
+     "finetuning_task": null,
+     "forced_bos_token_id": null,
+     "forced_eos_token_id": null,
+     "hidden_act": "quick_gelu",
+     "hidden_size": 32,
+     "id2label": {
+       "0": "LABEL_0",
+       "1": "LABEL_1"
+     },
+     "image_size": 30,
+     "initializer_factor": 1.0,
+     "initializer_range": 0.02,
+     "intermediate_size": 37,
+     "is_decoder": false,
+     "is_encoder_decoder": false,
+     "label2id": {
+       "LABEL_0": 0,
+       "LABEL_1": 1
+     },
+     "layer_norm_eps": 1e-05,
+     "length_penalty": 1.0,
+     "max_length": 20,
+     "min_length": 0,
+     "model_type": "clip_vision_model",
+     "no_repeat_ngram_size": 0,
+     "num_attention_heads": 4,
+     "num_beam_groups": 1,
+     "num_beams": 1,
+     "num_channels": 3,
+     "num_hidden_layers": 2,
+     "num_return_sequences": 1,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_size": 2,
+     "prefix": null,
+     "problem_type": null,
+     "projection_dim": 32,
+     "pruned_heads": {},
+     "remove_invalid_values": false,
+     "repetition_penalty": 1.0,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "sep_token_id": null,
+     "suppress_tokens": null,
+     "task_specific_params": null,
+     "temperature": 1.0,
+     "tf_legacy_loss": false,
+     "tie_encoder_decoder": false,
+     "tie_word_embeddings": true,
+     "tokenizer_class": null,
+     "top_k": 50,
+     "top_p": 1.0,
+     "torch_dtype": null,
+     "torchscript": false,
+     "typical_p": 1.0,
+     "use_bfloat16": false
+   },
+   "vision_feature_layer": -2,
+   "vision_feature_select_strategy": "default"
+ }
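The config describes a deliberately tiny LLaVA stack for testing: a 2-layer Llama decoder with hidden size 16 on the text side, and a 2-layer CLIP vision tower (image_size 30, patch_size 2) on the vision side. A sketch of inspecting it, under the same illustrative-path assumption as above:

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("./tiny-random-llava")  # path is illustrative
print(cfg.model_type)                # "llava"
print(cfg.image_token_index)         # 32000, matches added_tokens.json
print(cfg.text_config.hidden_size)   # 16
print(cfg.vision_config.image_size)  # 30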
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 0,
+   "eos_token_id": 1,
+   "pad_token_id": 2,
+   "transformers_version": "4.45.2"
+ }
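These are the decoding defaults that generate() picks up when no GenerationConfig is passed explicitly. A one-liner to verify, under the same local-checkout assumption:

from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("./tiny-random-llava")  # path is illustrative
print(gen.bos_token_id, gen.eos_token_id, gen.pad_token_id)    # 0 1 2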
openvino_language_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e86d0d5b600ecda999fa7cbe7bbbdcb595270ec8e118f7204afad4d02d629b6
+ size 2081160
openvino_language_model.xml ADDED
The diff for this file is too large to render. See raw diff
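Each OpenVINO model in this commit is an .xml topology paired with a .bin weights blob (stored through Git LFS, hence the pointer files); the runtime resolves the .bin automatically when it sits next to the .xml. A sketch of loading this one and listing its ports, assuming the openvino package is installed, the repo is checked out locally at an illustrative path, and the LFS files have been pulled:

import openvino as ov

core = ov.Core()
# Path is illustrative; the real .bin must be present next to the .xml.
model = core.read_model("./tiny-random-llava/openvino_language_model.xml")
for port in model.inputs:
    print(port.any_name, port.partial_shape)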
 
openvino_text_embeddings_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c7ce8ab058b08c19b3d170f247ad76ba3fa6ab0df882161a984eaf51959f1ca3
+ size 2048132
openvino_text_embeddings_model.xml ADDED
@@ -0,0 +1,94 @@
+ <?xml version="1.0"?>
+ <net name="Model6" version="11">
+   <layers>
+     <layer id="0" name="input" type="Parameter" version="opset1">
+       <data shape="?,?" element_type="i64" />
+       <output>
+         <port id="0" precision="I64" names="input">
+           <dim>-1</dim>
+           <dim>-1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="1" name="self.weight" type="Const" version="opset1">
+       <data element_type="f32" shape="32002, 16" offset="0" size="2048128" />
+       <output>
+         <port id="0" precision="FP32" names="self.weight">
+           <dim>32002</dim>
+           <dim>16</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="2" name="aten::embedding/Convert" type="Convert" version="opset1">
+       <data destination_type="i32" />
+       <input>
+         <port id="0" precision="I64">
+           <dim>-1</dim>
+           <dim>-1</dim>
+         </port>
+       </input>
+       <output>
+         <port id="1" precision="I32">
+           <dim>-1</dim>
+           <dim>-1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="3" name="aten::embedding/Constant" type="Const" version="opset1">
+       <data element_type="i32" shape="" offset="2048128" size="4" />
+       <output>
+         <port id="0" precision="I32" />
+       </output>
+     </layer>
+     <layer id="4" name="aten::embedding/Gather" type="Gather" version="opset8">
+       <data batch_dims="0" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>32002</dim>
+           <dim>16</dim>
+         </port>
+         <port id="1" precision="I32">
+           <dim>-1</dim>
+           <dim>-1</dim>
+         </port>
+         <port id="2" precision="I32" />
+       </input>
+       <output>
+         <port id="3" precision="FP32" names="inputs_embeds">
+           <dim>-1</dim>
+           <dim>-1</dim>
+           <dim>16</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="5" name="Result_13835" type="Result" version="opset1">
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>-1</dim>
+           <dim>16</dim>
+         </port>
+       </input>
+     </layer>
+   </layers>
+   <edges>
+     <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
+     <edge from-layer="1" from-port="0" to-layer="4" to-port="0" />
+     <edge from-layer="2" from-port="1" to-layer="4" to-port="1" />
+     <edge from-layer="3" from-port="0" to-layer="4" to-port="2" />
+     <edge from-layer="4" from-port="3" to-layer="5" to-port="0" />
+   </edges>
+   <rt_info>
+     <Runtime_version value="2024.5.0-17202-a7ccc5e0efc" />
+     <conversion_parameters>
+       <framework value="pytorch" />
+       <is_python_object value="True" />
+     </conversion_parameters>
+     <optimum>
+       <optimum_intel_version value="1.20.0.dev0+7cc52a7" />
+       <optimum_version value="1.23.2" />
+       <pytorch_version value="2.5.1" />
+       <transformers_version value="4.45.2" />
+     </optimum>
+   </rt_info>
+ </net>
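This graph is just the decoder's embedding lookup: token IDs are converted to i32 and gathered from the 32002x16 table, matching text_config.vocab_size and hidden_size. A sketch of running it; the path and CPU device are assumptions:

import numpy as np
import openvino as ov

core = ov.Core()
compiled = core.compile_model(
    "./tiny-random-llava/openvino_text_embeddings_model.xml", "CPU"
)
token_ids = np.array([[1, 5, 32000]], dtype=np.int64)  # (batch, seq)
embeds = compiled(token_ids)[compiled.output(0)]
print(embeds.shape)  # (1, 3, 16): one 16-dim embedding per token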
openvino_vision_embeddings_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:944384bdce22c00528a9bc2ecc49a1576370be263d860dd162cf0f70f918b70a
+ size 59668
openvino_vision_embeddings_model.xml ADDED
@@ -0,0 +1,1114 @@
+ <?xml version="1.0"?>
+ <net name="Model3" version="11">
+   <layers>
+     <layer id="0" name="pixel_values" type="Parameter" version="opset1">
+       <data shape="?,3,?,?" element_type="f32" />
+       <output>
+         <port id="0" precision="FP32" names="pixel_values">
+           <dim>-1</dim>
+           <dim>3</dim>
+           <dim>-1</dim>
+           <dim>-1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="1" name="self.vision_tower.vision_model.embeddings.class_embedding" type="Const" version="opset1">
+       <data element_type="f32" shape="32" offset="0" size="128" />
+       <output>
+         <port id="0" precision="FP32" names="self.vision_tower.vision_model.embeddings.class_embedding">
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="2" name="self.vision_tower.vision_model.embeddings.patch_embedding.weight" type="Const" version="opset1">
+       <data element_type="f32" shape="32, 3, 2, 2" offset="128" size="1536" />
+       <output>
+         <port id="0" precision="FP32" names="self.vision_tower.vision_model.embeddings.patch_embedding.weight">
+           <dim>32</dim>
+           <dim>3</dim>
+           <dim>2</dim>
+           <dim>2</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="3" name="__module.vision_tower.vision_model.embeddings.patch_embedding/aten::_convolution/Convolution" type="Convolution" version="opset1">
+       <data strides="2, 2" dilations="1, 1" pads_begin="0, 0" pads_end="0, 0" auto_pad="explicit" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>3</dim>
+           <dim>-1</dim>
+           <dim>-1</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>32</dim>
+           <dim>3</dim>
+           <dim>2</dim>
+           <dim>2</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="49,patch_embeds">
+           <dim>-1</dim>
+           <dim>32</dim>
+           <dim>-1</dim>
+           <dim>-1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="4" name="ShapeOf_13579" type="ShapeOf" version="opset3">
+       <data output_type="i64" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>32</dim>
+           <dim>-1</dim>
+           <dim>-1</dim>
+         </port>
+       </input>
+       <output>
+         <port id="1" precision="I64">
+           <dim>4</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="5" name="Constant_13580" type="Const" version="opset1">
+       <data element_type="i64" shape="1" offset="1664" size="8" />
+       <output>
+         <port id="0" precision="I64">
+           <dim>1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="6" name="Constant_13581" type="Const" version="opset1">
+       <data element_type="i64" shape="" offset="1664" size="8" />
+       <output>
+         <port id="0" precision="I64" />
+       </output>
+     </layer>
+     <layer id="7" name="Gather_13582" type="Gather" version="opset8">
+       <data batch_dims="0" />
+       <input>
+         <port id="0" precision="I64">
+           <dim>4</dim>
+         </port>
+         <port id="1" precision="I64">
+           <dim>1</dim>
+         </port>
+         <port id="2" precision="I64" />
+       </input>
+       <output>
+         <port id="3" precision="I64" names="42,79">
+           <dim>1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="8" name="Constant_13467" type="Const" version="opset1">
+       <data element_type="i64" shape="1" offset="1672" size="8" />
+       <output>
+         <port id="0" precision="I64">
+           <dim>1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="9" name="Constant_13469" type="Const" version="opset1">
+       <data element_type="i64" shape="1" offset="1672" size="8" />
+       <output>
+         <port id="0" precision="I64">
+           <dim>1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="10" name="__module.vision_tower.vision_model.embeddings/prim::ListConstruct" type="Concat" version="opset1">
+       <data axis="0" />
+       <input>
+         <port id="0" precision="I64">
+           <dim>1</dim>
+         </port>
+         <port id="1" precision="I64">
+           <dim>1</dim>
+         </port>
+         <port id="2" precision="I64">
+           <dim>1</dim>
+         </port>
+       </input>
+       <output>
+         <port id="3" precision="I64">
+           <dim>3</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="11" name="__module.vision_tower.vision_model.embeddings/aten::expand/Broadcast" type="Broadcast" version="opset3">
+       <data mode="bidirectional" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="I64">
+           <dim>3</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="53">
+           <dim>-1</dim>
+           <dim>1</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="12" name="Constant_13591" type="Const" version="opset1">
+       <data element_type="i64" shape="3" offset="1680" size="24" />
+       <output>
+         <port id="0" precision="I64">
+           <dim>3</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="13" name="__module.vision_tower.vision_model.embeddings/aten::flatten/Reshape" type="Reshape" version="opset1">
+       <data special_zero="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>32</dim>
+           <dim>-1</dim>
+           <dim>-1</dim>
+         </port>
+         <port id="1" precision="I64">
+           <dim>3</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="50">
+           <dim>-1</dim>
+           <dim>32</dim>
+           <dim>-1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="14" name="__module.vision_tower.vision_model.embeddings/aten::transpose/Constant" type="Const" version="opset1">
+       <data element_type="i32" shape="3" offset="1704" size="12" />
+       <output>
+         <port id="0" precision="I32">
+           <dim>3</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="15" name="__module.vision_tower.vision_model.embeddings/aten::transpose/Transpose" type="Transpose" version="opset1">
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>32</dim>
+           <dim>-1</dim>
+         </port>
+         <port id="1" precision="I32">
+           <dim>3</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="51">
+           <dim>-1</dim>
+           <dim>-1</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="16" name="__module.vision_tower.vision_model.embeddings/aten::cat/Concat" type="Concat" version="opset1">
+       <data axis="1" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>1</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>-1</dim>
+           <dim>-1</dim>
+           <dim>32</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="55,embeddings.1">
+           <dim>-1</dim>
+           <dim>-1</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="17" name="__module.vision_tower.vision_model.embeddings/aten::add/Multiply" type="Const" version="opset1">
+       <data element_type="f32" shape="1, 226, 32" offset="1716" size="28928" />
+       <output>
+         <port id="0" precision="FP32">
+           <dim>1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="18" name="__module.vision_tower.vision_model.embeddings/aten::add/Add" type="Add" version="opset1">
+       <data auto_broadcast="numpy" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>-1</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="58">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="19" name="__module.vision_tower.vision_model.pre_layrnorm/aten::layer_norm/Multiply" type="Const" version="opset1">
+       <data element_type="i32" shape="1" offset="30644" size="4" />
+       <output>
+         <port id="0" precision="I32">
+           <dim>1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="20" name="__module.vision_tower.vision_model.pre_layrnorm/aten::layer_norm/MVN" type="MVN" version="opset6">
+       <data eps="9.9999997473787516e-06" normalize_variance="true" eps_mode="INSIDE_SQRT" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="I32">
+           <dim>1</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="62,residual.1">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="21" name="__module.vision_tower.vision_model.encoder.layers.0.layer_norm1/aten::layer_norm/Multiply" type="Const" version="opset1">
+       <data element_type="i32" shape="1" offset="30644" size="4" />
+       <output>
+         <port id="0" precision="I32">
+           <dim>1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="22" name="__module.vision_tower.vision_model.encoder.layers.0.layer_norm1/aten::layer_norm/MVN" type="MVN" version="opset6">
+       <data eps="9.9999997473787516e-06" normalize_variance="true" eps_mode="INSIDE_SQRT" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="I32">
+           <dim>1</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="74,hidden_states.1">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="23" name="self.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight" type="Const" version="opset1">
+       <data element_type="f32" shape="32, 32" offset="30648" size="4096" />
+       <output>
+         <port id="0" precision="FP32" names="self.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight">
+           <dim>32</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="24" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj/aten::linear/MatMul" type="MatMul" version="opset1">
+       <data transpose_a="false" transpose_b="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>32</dim>
+           <dim>32</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="84,query_states.1">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="25" name="Constant_13592" type="Const" version="opset1">
+       <data element_type="i64" shape="4" offset="34744" size="32" />
+       <output>
+         <port id="0" precision="I64">
+           <dim>4</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="26" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::view/Reshape" type="Reshape" version="opset1">
+       <data special_zero="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="I64">
+           <dim>4</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="92">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>4</dim>
+           <dim>8</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="27" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::transpose/Constant" type="Const" version="opset1">
+       <data element_type="i32" shape="4" offset="34776" size="16" />
+       <output>
+         <port id="0" precision="I32">
+           <dim>4</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="28" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::transpose/Transpose" type="Transpose" version="opset1">
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>4</dim>
+           <dim>8</dim>
+         </port>
+         <port id="1" precision="I32">
+           <dim>4</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="93">
+           <dim>-1</dim>
+           <dim>4</dim>
+           <dim>226</dim>
+           <dim>8</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="29" name="self.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight" type="Const" version="opset1">
+       <data element_type="f32" shape="32, 32" offset="34792" size="4096" />
+       <output>
+         <port id="0" precision="FP32" names="self.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight">
+           <dim>32</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="30" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj/aten::linear/MatMul" type="MatMul" version="opset1">
+       <data transpose_a="false" transpose_b="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>32</dim>
+           <dim>32</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="87,key_states.1">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="31" name="Constant_13593" type="Const" version="opset1">
+       <data element_type="i64" shape="4" offset="34744" size="32" />
+       <output>
+         <port id="0" precision="I64">
+           <dim>4</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="32" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::view/Reshape_1" type="Reshape" version="opset1">
+       <data special_zero="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="I64">
+           <dim>4</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="95">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>4</dim>
+           <dim>8</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="33" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::transpose/Constant_1" type="Const" version="opset1">
+       <data element_type="i32" shape="4" offset="34776" size="16" />
+       <output>
+         <port id="0" precision="I32">
+           <dim>4</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="34" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::transpose/Transpose_1" type="Transpose" version="opset1">
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>4</dim>
+           <dim>8</dim>
+         </port>
+         <port id="1" precision="I32">
+           <dim>4</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="96">
+           <dim>-1</dim>
+           <dim>4</dim>
+           <dim>226</dim>
+           <dim>8</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="35" name="self.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight" type="Const" version="opset1">
+       <data element_type="f32" shape="32, 32" offset="38888" size="4096" />
+       <output>
+         <port id="0" precision="FP32" names="self.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight">
+           <dim>32</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="36" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj/aten::linear/MatMul" type="MatMul" version="opset1">
+       <data transpose_a="false" transpose_b="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>32</dim>
+           <dim>32</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="90,value_states.1">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="37" name="Constant_13594" type="Const" version="opset1">
+       <data element_type="i64" shape="4" offset="34744" size="32" />
+       <output>
+         <port id="0" precision="I64">
+           <dim>4</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="38" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::view/Reshape_2" type="Reshape" version="opset1">
+       <data special_zero="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="I64">
+           <dim>4</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="98">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>4</dim>
+           <dim>8</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="39" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::transpose/Constant_2" type="Const" version="opset1">
+       <data element_type="i32" shape="4" offset="34776" size="16" />
+       <output>
+         <port id="0" precision="I32">
+           <dim>4</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="40" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::transpose/Transpose_2" type="Transpose" version="opset1">
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>4</dim>
+           <dim>8</dim>
+         </port>
+         <port id="1" precision="I32">
+           <dim>4</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="99">
+           <dim>-1</dim>
+           <dim>4</dim>
+           <dim>226</dim>
+           <dim>8</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="41" name="Constant_11010" type="Const" version="opset1">
+       <data element_type="f32" shape="" offset="42984" size="4" />
+       <output>
+         <port id="0" precision="FP32" />
+       </output>
+     </layer>
+     <layer id="42" name="27" type="Const" version="opset1">
+       <data element_type="f32" shape="" offset="42988" size="4" />
+       <output>
+         <port id="0" precision="FP32" names="27" />
+       </output>
+     </layer>
+     <layer id="43" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::scaled_dot_product_attention/ScaledDotProductAttention" type="ScaledDotProductAttention" version="opset13">
+       <data causal="false" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>4</dim>
+           <dim>226</dim>
+           <dim>8</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>-1</dim>
+           <dim>4</dim>
+           <dim>226</dim>
+           <dim>8</dim>
+         </port>
+         <port id="2" precision="FP32">
+           <dim>-1</dim>
+           <dim>4</dim>
+           <dim>226</dim>
+           <dim>8</dim>
+         </port>
+         <port id="3" precision="FP32" />
+         <port id="4" precision="FP32" />
+       </input>
+       <output>
+         <port id="5" precision="FP32" names="100,attn_output.1">
+           <dim>-1</dim>
+           <dim>4</dim>
+           <dim>226</dim>
+           <dim>8</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="44" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::transpose/Constant_3" type="Const" version="opset1">
+       <data element_type="i32" shape="4" offset="34776" size="16" />
+       <output>
+         <port id="0" precision="I32">
+           <dim>4</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="45" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::transpose/Transpose_3" type="Transpose" version="opset1">
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>4</dim>
+           <dim>226</dim>
+           <dim>8</dim>
+         </port>
+         <port id="1" precision="I32">
+           <dim>4</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="101,attn_output.3">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>4</dim>
+           <dim>8</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="46" name="Constant_13595" type="Const" version="opset1">
+       <data element_type="i64" shape="3" offset="42992" size="24" />
+       <output>
+         <port id="0" precision="I64">
+           <dim>3</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="47" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn/aten::reshape/Reshape" type="Reshape" version="opset1">
+       <data special_zero="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>4</dim>
+           <dim>8</dim>
+         </port>
+         <port id="1" precision="I64">
+           <dim>3</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="103">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="48" name="self.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight" type="Const" version="opset1">
+       <data element_type="f32" shape="32, 32" offset="43016" size="4096" />
+       <output>
+         <port id="0" precision="FP32" names="self.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight">
+           <dim>32</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="49" name="__module.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj/aten::linear/MatMul" type="MatMul" version="opset1">
+       <data transpose_a="false" transpose_b="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>32</dim>
+           <dim>32</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="106,hidden_states.3">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="50" name="__module.vision_tower.vision_model.encoder.layers.0/aten::add/Add" type="Add" version="opset1">
+       <data auto_broadcast="numpy" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="107,residual.3">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="51" name="__module.vision_tower.vision_model.encoder.layers.0.layer_norm2/aten::layer_norm/Multiply" type="Const" version="opset1">
+       <data element_type="i32" shape="1" offset="30644" size="4" />
+       <output>
+         <port id="0" precision="I32">
+           <dim>1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="52" name="__module.vision_tower.vision_model.encoder.layers.0.layer_norm2/aten::layer_norm/MVN" type="MVN" version="opset6">
+       <data eps="9.9999997473787516e-06" normalize_variance="true" eps_mode="INSIDE_SQRT" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="I32">
+           <dim>1</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="111">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="53" name="self.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight" type="Const" version="opset1">
+       <data element_type="f32" shape="37, 32" offset="47112" size="4736" />
+       <output>
+         <port id="0" precision="FP32" names="self.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight">
+           <dim>37</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="54" name="__module.vision_tower.vision_model.encoder.layers.0.mlp.fc1/aten::linear/MatMul" type="MatMul" version="opset1">
+       <data transpose_a="false" transpose_b="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>37</dim>
+           <dim>32</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="117,input.1">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>37</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="55" name="Constant_13523" type="Const" version="opset1">
+       <data element_type="f32" shape="" offset="51848" size="4" />
+       <output>
+         <port id="0" precision="FP32" />
+       </output>
+     </layer>
+     <layer id="56" name="__module.vision_tower.vision_model.encoder.layers.0.mlp.activation_fn/aten::mul/Multiply_1" type="Swish" version="opset4">
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>37</dim>
+         </port>
+         <port id="1" precision="FP32" />
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="120">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>37</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="57" name="self.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight" type="Const" version="opset1">
+       <data element_type="f32" shape="32, 37" offset="51852" size="4736" />
+       <output>
+         <port id="0" precision="FP32" names="self.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight">
+           <dim>32</dim>
+           <dim>37</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="58" name="__module.vision_tower.vision_model.encoder.layers.0.mlp.fc2/aten::linear/MatMul" type="MatMul" version="opset1">
+       <data transpose_a="false" transpose_b="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>37</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>32</dim>
+           <dim>37</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="123,hidden_states.5">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="59" name="__module.vision_tower.vision_model.encoder.layers.0/aten::add/Add_1" type="Add" version="opset1">
+       <data auto_broadcast="numpy" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="124,185,9,residual.5">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="60" name="Constant_11629" type="Const" version="opset1">
+       <data element_type="i64" shape="1" offset="1672" size="8" />
+       <output>
+         <port id="0" precision="I64">
+           <dim>1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="61" name="Constant_11631" type="Const" version="opset1">
+       <data element_type="i64" shape="1" offset="56588" size="8" />
+       <output>
+         <port id="0" precision="I64">
+           <dim>1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="62" name="Constant_11633" type="Const" version="opset1">
+       <data element_type="i64" shape="1" offset="1672" size="8" />
+       <output>
+         <port id="0" precision="I64">
+           <dim>1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="63" name="aten::slice/Reshape_1" type="Const" version="opset1">
+       <data element_type="i64" shape="1" offset="1672" size="8" />
+       <output>
+         <port id="0" precision="I64">
+           <dim>1</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="64" name="aten::slice/Slice_1" type="Slice" version="opset8">
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>226</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="I64">
+           <dim>1</dim>
+         </port>
+         <port id="2" precision="I64">
+           <dim>1</dim>
+         </port>
+         <port id="3" precision="I64">
+           <dim>1</dim>
+         </port>
+         <port id="4" precision="I64">
+           <dim>1</dim>
+         </port>
+       </input>
+       <output>
+         <port id="5" precision="FP32" names="14">
+           <dim>-1</dim>
+           <dim>225</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="65" name="self.multi_modal_projector.linear_1.weight" type="Const" version="opset1">
+       <data element_type="f32" shape="16, 32" offset="56596" size="2048" />
+       <output>
+         <port id="0" precision="FP32" names="self.multi_modal_projector.linear_1.weight">
+           <dim>16</dim>
+           <dim>32</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="66" name="__module.multi_modal_projector.linear_1/aten::linear/MatMul" type="MatMul" version="opset1">
+       <data transpose_a="false" transpose_b="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>225</dim>
+           <dim>32</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>16</dim>
+           <dim>32</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="195">
+           <dim>-1</dim>
+           <dim>225</dim>
+           <dim>16</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="67" name="__module.multi_modal_projector.act/aten::gelu/Gelu" type="Gelu" version="opset7">
+       <data approximation_mode="ERF" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>225</dim>
+           <dim>16</dim>
+         </port>
+       </input>
+       <output>
+         <port id="1" precision="FP32" names="196">
+           <dim>-1</dim>
+           <dim>225</dim>
+           <dim>16</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="68" name="self.multi_modal_projector.linear_2.weight" type="Const" version="opset1">
+       <data element_type="f32" shape="16, 16" offset="58644" size="1024" />
+       <output>
+         <port id="0" precision="FP32" names="self.multi_modal_projector.linear_2.weight">
+           <dim>16</dim>
+           <dim>16</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="69" name="__module.multi_modal_projector.linear_2/aten::linear/Add" type="MatMul" version="opset1">
+       <data transpose_a="false" transpose_b="true" />
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>225</dim>
+           <dim>16</dim>
+         </port>
+         <port id="1" precision="FP32">
+           <dim>16</dim>
+           <dim>16</dim>
+         </port>
+       </input>
+       <output>
+         <port id="2" precision="FP32" names="last_hidden_state">
+           <dim>-1</dim>
+           <dim>225</dim>
+           <dim>16</dim>
+         </port>
+       </output>
+     </layer>
+     <layer id="70" name="Result_11692" type="Result" version="opset1">
+       <input>
+         <port id="0" precision="FP32">
+           <dim>-1</dim>
+           <dim>225</dim>
+           <dim>16</dim>
+         </port>
+       </input>
+     </layer>
+   </layers>
+   <edges>
+     <edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
+     <edge from-layer="1" from-port="0" to-layer="11" to-port="0" />
+     <edge from-layer="2" from-port="0" to-layer="3" to-port="1" />
+     <edge from-layer="3" from-port="2" to-layer="4" to-port="0" />
+     <edge from-layer="3" from-port="2" to-layer="13" to-port="0" />
+     <edge from-layer="4" from-port="1" to-layer="7" to-port="0" />
+     <edge from-layer="5" from-port="0" to-layer="7" to-port="1" />
+     <edge from-layer="6" from-port="0" to-layer="7" to-port="2" />
+     <edge from-layer="7" from-port="3" to-layer="10" to-port="0" />
+     <edge from-layer="8" from-port="0" to-layer="10" to-port="1" />
+     <edge from-layer="9" from-port="0" to-layer="10" to-port="2" />
+     <edge from-layer="10" from-port="3" to-layer="11" to-port="1" />
+     <edge from-layer="11" from-port="2" to-layer="16" to-port="0" />
+     <edge from-layer="12" from-port="0" to-layer="13" to-port="1" />
+     <edge from-layer="13" from-port="2" to-layer="15" to-port="0" />
+     <edge from-layer="14" from-port="0" to-layer="15" to-port="1" />
+     <edge from-layer="15" from-port="2" to-layer="16" to-port="1" />
+     <edge from-layer="16" from-port="2" to-layer="18" to-port="0" />
+     <edge from-layer="17" from-port="0" to-layer="18" to-port="1" />
+     <edge from-layer="18" from-port="2" to-layer="20" to-port="0" />
+     <edge from-layer="19" from-port="0" to-layer="20" to-port="1" />
+     <edge from-layer="20" from-port="2" to-layer="22" to-port="0" />
+     <edge from-layer="20" from-port="2" to-layer="50" to-port="0" />
+     <edge from-layer="21" from-port="0" to-layer="22" to-port="1" />
+     <edge from-layer="22" from-port="2" to-layer="24" to-port="0" />
+     <edge from-layer="22" from-port="2" to-layer="30" to-port="0" />
+     <edge from-layer="22" from-port="2" to-layer="36" to-port="0" />
+     <edge from-layer="23" from-port="0" to-layer="24" to-port="1" />
+     <edge from-layer="24" from-port="2" to-layer="26" to-port="0" />
+     <edge from-layer="25" from-port="0" to-layer="26" to-port="1" />
+     <edge from-layer="26" from-port="2" to-layer="28" to-port="0" />
+     <edge from-layer="27" from-port="0" to-layer="28" to-port="1" />
+     <edge from-layer="28" from-port="2" to-layer="43" to-port="0" />
+     <edge from-layer="29" from-port="0" to-layer="30" to-port="1" />
+     <edge from-layer="30" from-port="2" to-layer="32" to-port="0" />
+     <edge from-layer="31" from-port="0" to-layer="32" to-port="1" />
+     <edge from-layer="32" from-port="2" to-layer="34" to-port="0" />
+     <edge from-layer="33" from-port="0" to-layer="34" to-port="1" />
+     <edge from-layer="34" from-port="2" to-layer="43" to-port="1" />
+     <edge from-layer="35" from-port="0" to-layer="36" to-port="1" />
+     <edge from-layer="36" from-port="2" to-layer="38" to-port="0" />
+     <edge from-layer="37" from-port="0" to-layer="38" to-port="1" />
+     <edge from-layer="38" from-port="2" to-layer="40" to-port="0" />
+     <edge from-layer="39" from-port="0" to-layer="40" to-port="1" />
+     <edge from-layer="40" from-port="2" to-layer="43" to-port="2" />
+     <edge from-layer="41" from-port="0" to-layer="43" to-port="3" />
+     <edge from-layer="42" from-port="0" to-layer="43" to-port="4" />
+     <edge from-layer="43" from-port="5" to-layer="45" to-port="0" />
+     <edge from-layer="44" from-port="0" to-layer="45" to-port="1" />
+     <edge from-layer="45" from-port="2" to-layer="47" to-port="0" />
+     <edge from-layer="46" from-port="0" to-layer="47" to-port="1" />
+     <edge from-layer="47" from-port="2" to-layer="49" to-port="0" />
+     <edge from-layer="48" from-port="0" to-layer="49" to-port="1" />
+     <edge from-layer="49" from-port="2" to-layer="50" to-port="1" />
+     <edge from-layer="50" from-port="2" to-layer="52" to-port="0" />
+     <edge from-layer="50" from-port="2" to-layer="59" to-port="0" />
+     <edge from-layer="51" from-port="0" to-layer="52" to-port="1" />
+     <edge from-layer="52" from-port="2" to-layer="54" to-port="0" />
+     <edge from-layer="53" from-port="0" to-layer="54" to-port="1" />
+     <edge from-layer="54" from-port="2" to-layer="56" to-port="0" />
+     <edge from-layer="55" from-port="0" to-layer="56" to-port="1" />
+     <edge from-layer="56" from-port="2" to-layer="58" to-port="0" />
+     <edge from-layer="57" from-port="0" to-layer="58" to-port="1" />
+     <edge from-layer="58" from-port="2" to-layer="59" to-port="1" />
+     <edge from-layer="59" from-port="2" to-layer="64" to-port="0" />
+     <edge from-layer="60" from-port="0" to-layer="64" to-port="1" />
+     <edge from-layer="61" from-port="0" to-layer="64" to-port="2" />
+     <edge from-layer="62" from-port="0" to-layer="64" to-port="3" />
+     <edge from-layer="63" from-port="0" to-layer="64" to-port="4" />
+     <edge from-layer="64" from-port="5" to-layer="66" to-port="0" />
+     <edge from-layer="65" from-port="0" to-layer="66" to-port="1" />
+     <edge from-layer="66" from-port="2" to-layer="67" to-port="0" />
+     <edge from-layer="67" from-port="1" to-layer="69" to-port="0" />
+     <edge from-layer="68" from-port="0" to-layer="69" to-port="1" />
+     <edge from-layer="69" from-port="2" to-layer="70" to-port="0" />
+   </edges>
+   <rt_info>
+     <Runtime_version value="2024.5.0-17202-a7ccc5e0efc" />
+     <conversion_parameters>
+       <framework value="pytorch" />
+       <is_python_object value="True" />
+     </conversion_parameters>
+     <optimum>
+       <optimum_intel_version value="1.20.0.dev0+7cc52a7" />
+       <optimum_version value="1.23.2" />
+       <pytorch_version value="2.5.1" />
+       <transformers_version value="4.45.2" />
+     </optimum>
+   </rt_info>
+ </net>
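Reading the graph top to bottom: a 2x2, stride-2 convolution turns a 30x30 image into 15x15 = 225 patch embeddings, the class token plus a fixed (1, 226, 32) position-embedding table bring the sequence to 226, a single CLIP encoder layer runs (matching vision_feature_layer -2 in config.json, so only the first of the two layers appears in the export), the Slice drops the class token, and the two-layer GELU projector maps 32 dims down to the decoder's 16. A sketch of one forward pass; paths and device are assumptions, and note the fixed position table pins the input to 30x30:

import numpy as np
import openvino as ov

core = ov.Core()
compiled = core.compile_model(
    "./tiny-random-llava/openvino_vision_embeddings_model.xml", "CPU"
)
pixels = np.random.rand(1, 3, 30, 30).astype(np.float32)  # preprocessed image
features = compiled(pixels)[compiled.output(0)]
print(features.shape)  # (1, 225, 16): 15*15 patch tokens, class token sliced off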
preprocessor_config.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "crop_size": {
+     "height": 30,
+     "width": 30
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "processor_class": "LlavaProcessor",
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 30
+   }
+ }
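This is standard CLIP preprocessing scaled down to the tiny tower: resize the shortest edge to 30, center-crop to 30x30, rescale by 1/255, and normalize with the CLIP mean/std. A sketch producing exactly the pixel_values the vision model above expects (dummy image; path is illustrative):

import numpy as np
from PIL import Image
from transformers import CLIPImageProcessor

proc = CLIPImageProcessor.from_pretrained("./tiny-random-llava")
img = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))  # dummy RGB image
batch = proc(images=img, return_tensors="np")
print(batch["pixel_values"].shape)  # (1, 3, 30, 30)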
processor_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "image_token": "<image>",
+   "patch_size": null,
+   "processor_class": "LlavaProcessor",
+   "vision_feature_select_strategy": null
+ }
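LlavaProcessor stitches the tokenizer and the image processor together behind one call; as of transformers 4.45, a null patch_size makes it fall back to the legacy behavior of leaving the single <image> placeholder unexpanded (with a deprecation warning). A sketch, under the same local-checkout assumption:

import numpy as np
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("./tiny-random-llava")
img = Image.fromarray(np.zeros((64, 64, 3), dtype=np.uint8))
inputs = processor(text="USER: <image>\nDescribe this. ASSISTANT:",
                   images=img, return_tensors="np")
print(sorted(inputs.keys()))  # ['attention_mask', 'input_ids', 'pixel_values']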
special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
tokenizer_config.json ADDED
@@ -0,0 +1,60 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": false,
+   "add_prefix_space": null,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32000": {
+       "content": "<image>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32001": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "</s>",
+   "legacy": false,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "padding_side": "left",
+   "processor_class": "LlavaProcessor",
+   "sp_model_kwargs": {},
+   "tokenizer_class": "LlamaTokenizer",
+   "trust_remote_code": false,
+   "unk_token": "<unk>",
+   "use_default_system_prompt": false
+ }
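Worth noting here: padding_side is "left", which keeps the most recent tokens adjacent to the generation position in batched decoding, and the huge model_max_length is the transformers sentinel for "no explicit limit". A sketch of batched tokenization with the <pad> token defined above (path is illustrative):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./tiny-random-llava")
batch = tok(["hi", "a much longer prompt"], padding=True, return_tensors="np")
print(tok.padding_side)       # "left"
print(batch["input_ids"][0])  # shorter row is left-padded with id 32001 (<pad>)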