Update convert script
Browse files
- convert_image_gguf.py +79 -26
convert_image_gguf.py
CHANGED
@@ -133,16 +133,20 @@ def main():
|
|
133 |
fout.add_array("clip.vision.image_mean", [0.48145466, 0.4578275, 0.40821073])
|
134 |
fout.add_array("clip.vision.image_std", [0.26862954, 0.26130258, 0.27577711])
|
135 |
|
|
|
|
|
136 |
# Vision model tensors
|
137 |
prefix = "model.vision_embed_tokens.img_processor.vision_model."
|
138 |
|
139 |
fout.add_tensor(
|
140 |
"v.class_embd",
|
141 |
-
tensors.get_tensor(f"{prefix}embeddings.class_embedding").astype(np.
|
142 |
)
|
143 |
fout.add_tensor(
|
144 |
"v.patch_embd.weight",
|
145 |
-
tensors.get_tensor(f"{prefix}embeddings.patch_embedding.weight")
|
|
|
|
|
146 |
)
|
147 |
fout.add_tensor(
|
148 |
"v.position_embd.weight",
|
@@ -158,81 +162,130 @@ def main():
|
|
158 |
tensors.get_tensor("model.vision_embed_tokens.glb_GN").astype(np.float32),
|
159 |
)
|
160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
for i in range(clip_vision_config["num_hidden_layers"]):
|
162 |
-
#
|
163 |
fout.add_tensor(
|
164 |
-
f"blk.{i}.attn_norm.weight",
|
165 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm1.weight").astype(np.float32),
|
166 |
)
|
167 |
fout.add_tensor(
|
168 |
-
f"blk.{i}.attn_norm.bias",
|
169 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm1.bias").astype(np.float32),
|
170 |
)
|
171 |
fout.add_tensor(
|
172 |
-
f"blk.{i}.ffn_norm.weight",
|
173 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm2.weight").astype(np.float32),
|
174 |
)
|
175 |
fout.add_tensor(
|
176 |
-
f"blk.{i}.ffn_norm.bias",
|
177 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm2.bias").astype(np.float32),
|
178 |
)
|
179 |
|
180 |
# feed forward
|
181 |
fout.add_tensor(
|
182 |
-
f"blk.{i}.ffn_down.weight",
|
183 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc1.weight").astype(np.float16),
|
184 |
)
|
185 |
fout.add_tensor(
|
186 |
-
f"blk.{i}.ffn_down.bias",
|
187 |
-
tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc1.bias").astype(np.
|
188 |
)
|
189 |
fout.add_tensor(
|
190 |
-
f"blk.{i}.ffn_up.weight",
|
191 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc2.weight").astype(np.float16),
|
192 |
)
|
193 |
fout.add_tensor(
|
194 |
-
f"blk.{i}.ffn_up.bias",
|
195 |
-
tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc2.bias").astype(np.
|
196 |
)
|
197 |
|
198 |
# attention
|
199 |
fout.add_tensor(
|
200 |
-
f"blk.{i}.attn_k.weight",
|
201 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.k_proj.weight").astype(np.float16),
|
202 |
)
|
203 |
fout.add_tensor(
|
204 |
-
f"blk.{i}.attn_k.bias",
|
205 |
-
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.k_proj.bias").astype(np.
|
206 |
)
|
207 |
fout.add_tensor(
|
208 |
-
f"blk.{i}.
|
209 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.out_proj.weight").astype(np.float16),
|
210 |
)
|
211 |
fout.add_tensor(
|
212 |
-
f"blk.{i}.
|
213 |
-
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.out_proj.bias").astype(np.
|
214 |
)
|
215 |
fout.add_tensor(
|
216 |
-
f"blk.{i}.attn_q.weight",
|
217 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.q_proj.weight").astype(np.float16),
|
218 |
)
|
219 |
fout.add_tensor(
|
220 |
-
f"blk.{i}.attn_q.bias",
|
221 |
-
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.q_proj.bias").astype(np.
|
222 |
)
|
223 |
fout.add_tensor(
|
224 |
-
f"blk.{i}.attn_v.weight",
|
225 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.v_proj.weight").astype(np.float16),
|
226 |
)
|
227 |
fout.add_tensor(
|
228 |
-
f"blk.{i}.attn_v.bias",
|
229 |
-
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.v_proj.bias").astype(np.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
)
|
231 |
|
232 |
fout.add_tensor(
|
233 |
-
"
|
234 |
tensors.get_tensor(f"{prefix}post_layernorm.weight").astype(np.float32),
|
235 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
236 |
|
237 |
fout.write_header_to_file()
|
238 |
fout.write_kv_data_to_file()
|
|
|
133 |
fout.add_array("clip.vision.image_mean", [0.48145466, 0.4578275, 0.40821073])
|
134 |
fout.add_array("clip.vision.image_std", [0.26862954, 0.26130258, 0.27577711])
|
135 |
|
136 |
+
fout.add_bool("clip.use_gelu", clip_vision_config["hidden_act"] != "quick_gelu")
|
137 |
+
|
138 |
# Vision model tensors
|
139 |
prefix = "model.vision_embed_tokens.img_processor.vision_model."
|
140 |
|
141 |
fout.add_tensor(
|
142 |
"v.class_embd",
|
143 |
+
tensors.get_tensor(f"{prefix}embeddings.class_embedding").astype(np.float32),
|
144 |
)
|
145 |
fout.add_tensor(
|
146 |
"v.patch_embd.weight",
|
147 |
+
tensors.get_tensor(f"{prefix}embeddings.patch_embedding.weight")
|
148 |
+
.reshape(clip_vision_config["hidden_size"], 3, clip_vision_config["patch_size"], clip_vision_config["patch_size"])
|
149 |
+
.astype(np.float16),
|
150 |
)
|
151 |
fout.add_tensor(
|
152 |
"v.position_embd.weight",
|
|
|
162 |
tensors.get_tensor("model.vision_embed_tokens.glb_GN").astype(np.float32),
|
163 |
)
|
164 |
|
165 |
+
fout.add_tensor(
|
166 |
+
"mm.0.weight",
|
167 |
+
tensors.get_tensor("model.vision_embed_tokens.img_projection.0.weight").astype(np.float16),
|
168 |
+
)
|
169 |
+
fout.add_tensor(
|
170 |
+
"mm.0.bias",
|
171 |
+
tensors.get_tensor("model.vision_embed_tokens.img_projection.0.bias").astype(np.float32),
|
172 |
+
)
|
173 |
+
|
174 |
+
fout.add_tensor(
|
175 |
+
"mm.2.weight",
|
176 |
+
tensors.get_tensor("model.vision_embed_tokens.img_projection.2.weight").astype(np.float16),
|
177 |
+
)
|
178 |
+
fout.add_tensor(
|
179 |
+
"mm.2.bias",
|
180 |
+
tensors.get_tensor("model.vision_embed_tokens.img_projection.2.bias").astype(np.float32),
|
181 |
+
)
|
182 |
+
|
183 |
for i in range(clip_vision_config["num_hidden_layers"]):
|
184 |
+
# attention norm
|
185 |
fout.add_tensor(
|
186 |
+
f"v.blk.{i}.attn_norm.weight",
|
187 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm1.weight").astype(np.float32),
|
188 |
)
|
189 |
fout.add_tensor(
|
190 |
+
f"v.blk.{i}.attn_norm.bias",
|
191 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm1.bias").astype(np.float32),
|
192 |
)
|
193 |
fout.add_tensor(
|
194 |
+
f"v.blk.{i}.ffn_norm.weight",
|
195 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm2.weight").astype(np.float32),
|
196 |
)
|
197 |
fout.add_tensor(
|
198 |
+
f"v.blk.{i}.ffn_norm.bias",
|
199 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm2.bias").astype(np.float32),
|
200 |
)
|
201 |
|
202 |
# feed forward
|
203 |
fout.add_tensor(
|
204 |
+
f"v.blk.{i}.ffn_down.weight",
|
205 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc1.weight").astype(np.float16),
|
206 |
)
|
207 |
fout.add_tensor(
|
208 |
+
f"v.blk.{i}.ffn_down.bias",
|
209 |
+
tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc1.bias").astype(np.float32),
|
210 |
)
|
211 |
fout.add_tensor(
|
212 |
+
f"v.blk.{i}.ffn_up.weight",
|
213 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc2.weight").astype(np.float16),
|
214 |
)
|
215 |
fout.add_tensor(
|
216 |
+
f"v.blk.{i}.ffn_up.bias",
|
217 |
+
tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc2.bias").astype(np.float32),
|
218 |
)
|
219 |
|
220 |
# attention
|
221 |
fout.add_tensor(
|
222 |
+
f"v.blk.{i}.attn_k.weight",
|
223 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.k_proj.weight").astype(np.float16),
|
224 |
)
|
225 |
fout.add_tensor(
|
226 |
+
f"v.blk.{i}.attn_k.bias",
|
227 |
+
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.k_proj.bias").astype(np.float32),
|
228 |
)
|
229 |
fout.add_tensor(
|
230 |
+
f"v.blk.{i}.attn_out.weight",
|
231 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.out_proj.weight").astype(np.float16),
|
232 |
)
|
233 |
fout.add_tensor(
|
234 |
+
f"v.blk.{i}.attn_out.bias",
|
235 |
+
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.out_proj.bias").astype(np.float32),
|
236 |
)
|
237 |
fout.add_tensor(
|
238 |
+
f"v.blk.{i}.attn_q.weight",
|
239 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.q_proj.weight").astype(np.float16),
|
240 |
)
|
241 |
fout.add_tensor(
|
242 |
+
f"v.blk.{i}.attn_q.bias",
|
243 |
+
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.q_proj.bias").astype(np.float32),
|
244 |
)
|
245 |
fout.add_tensor(
|
246 |
+
f"v.blk.{i}.attn_v.weight",
|
247 |
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.v_proj.weight").astype(np.float16),
|
248 |
)
|
249 |
fout.add_tensor(
|
250 |
+
f"v.blk.{i}.attn_v.bias",
|
251 |
+
tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.v_proj.bias").astype(np.float32),
|
252 |
+
)
|
253 |
+
|
254 |
+
# layer norm
|
255 |
+
fout.add_tensor(
|
256 |
+
f"v.blk.{i}.ln1.weight",
|
257 |
+
tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm1.weight").astype(np.float32),
|
258 |
+
)
|
259 |
+
fout.add_tensor(
|
260 |
+
f"v.blk.{i}.ln1.bias",
|
261 |
+
tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm1.bias").astype(np.float32),
|
262 |
+
)
|
263 |
+
fout.add_tensor(
|
264 |
+
f"v.blk.{i}.ln2.weight",
|
265 |
+
tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm2.weight").astype(np.float32),
|
266 |
+
)
|
267 |
+
fout.add_tensor(
|
268 |
+
f"v.blk.{i}.ln2.bias",
|
269 |
+
tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm2.bias").astype(np.float32),
|
270 |
)
|
271 |
|
272 |
fout.add_tensor(
|
273 |
+
"v.post_ln.weight",
|
274 |
tensors.get_tensor(f"{prefix}post_layernorm.weight").astype(np.float32),
|
275 |
)
|
276 |
+
fout.add_tensor(
|
277 |
+
"v.post_ln.bias",
|
278 |
+
tensors.get_tensor(f"{prefix}post_layernorm.bias").astype(np.float32),
|
279 |
+
)
|
280 |
+
|
281 |
+
fout.add_tensor(
|
282 |
+
"v.pre_ln.weight",
|
283 |
+
tensors.get_tensor(f"{prefix}pre_layrnorm.weight").astype(np.float32),
|
284 |
+
)
|
285 |
+
fout.add_tensor(
|
286 |
+
"v.pre_ln.bias",
|
287 |
+
tensors.get_tensor(f"{prefix}pre_layrnorm.bias").astype(np.float32),
|
288 |
+
)
|
289 |
|
290 |
fout.write_header_to_file()
|
291 |
fout.write_kv_data_to_file()
|