abetlen committed on
Commit
260f230
1 Parent(s): fb40b94

Add conversion script

Files changed (1)
  1. convert_image_gguf.py +243 -0
convert_image_gguf.py ADDED
@@ -0,0 +1,243 @@
import os
import json
import typing
import pathlib
import argparse

import numpy as np
import numpy.typing as npt
import torch
from gguf import *
from safetensors import safe_open

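# Expand a gguf key template such as "{arch}.embedding_length" into a concrete key.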
def k(raw_key: str, arch: str) -> str:
    return raw_key.format(arch=arch)

class Args:
    def __init__(self, model, output):
        self.model = model
        self.output = output

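# Minimal reader for a sharded safetensors checkpoint: resolves each tensor name
# through model.safetensors.index.json, opens the referenced shard files up front,
# and loads individual tensors on demand.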
class SafetensorsIndexFile(typing.TypedDict):
    weight_map: typing.Dict[str, str]

class SafetensorsIndex:
    def __init__(self, index_file_path: str):
        directory = os.path.dirname(index_file_path)
        self.index = typing.cast(SafetensorsIndexFile, json.load(open(index_file_path)))
        self.weight_map = self.index["weight_map"]
        files = set(self.weight_map.values())
        self.tensors = {file: safe_open(os.path.join(directory, file), framework="pt") for file in files}

    def get_tensor(self, key: str) -> npt.NDArray[np.float32]:
        # convert to float32 and cast to np array
        return typing.cast(npt.NDArray[np.float32], self.tensors[self.weight_map[key]].get_tensor(key).to(torch.float32).numpy())

def main():
    parser = argparse.ArgumentParser(description="Extract vision model from safetensors to GGUF")
    parser.add_argument("--model", type=str, required=True, help="Input model directory (containing config.json and safetensors shards)")
    parser.add_argument("--output", type=str, required=True, help="Output GGUF file")
    args = parser.parse_args()

    dir_model = pathlib.Path(args.model)
    config = json.load(open(dir_model / "config.json"))  # parsed for reference; not used below

    # tensors = safe_open(args.model, framework="np", device="cpu")
    tensors = SafetensorsIndex((dir_model / "model.safetensors.index.json").as_posix())

    ftype = 1  # fp16

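    # Vision tower hyperparameters. The commented values below appear to be the
    # CLIPVisionConfig defaults, kept for reference; the dict that follows
    # (ViT-L/14 at 224px) is what actually gets written to the GGUF metadata.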
    # hidden_size=768,
    # intermediate_size=3072,
    # projection_dim=512,
    # num_hidden_layers=12,
    # num_attention_heads=12,
    # num_channels=3,
    # image_size=224,
    # patch_size=32,
    # hidden_act="quick_gelu",
    # layer_norm_eps=1e-5,
    # attention_dropout=0.0,
    # initializer_range=0.02,
    # initializer_factor=1.0,
    clip_vision_config = {
        "hidden_size": 1024,
        "intermediate_size": 4096,
        "projection_dim": 1024,
        "num_hidden_layers": 24,
        "num_attention_heads": 16,
        "num_channels": 3,
        "image_size": 224,
        "patch_size": 14,
        "hidden_act": "quick_gelu",
        "layer_norm_eps": 1e-5,
        "attention_dropout": 0.0,
        "initializer_range": 0.02,
        "initializer_factor": 1.0,
    }
    # CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(
    #     attention_dropout=0.0,
    #     dropout=0.0,
    #     hidden_act="quick_gelu",
    #     hidden_size=1024,
    #     image_size=336,
    #     initializer_factor=1.0,
    #     initializer_range=0.02,
    #     intermediate_size=4096,
    #     layer_norm_eps=1e-05,
    #     num_attention_heads=16,
    #     num_channels=3,
    #     num_hidden_layers=24,
    #     patch_size=14,
    #     projection_dim=768
    # )
    clip_vision_config.update(dict(
        attention_dropout=0.0,
        dropout=0.0,
        hidden_act="quick_gelu",
        hidden_size=1024,
        image_size=224,
        initializer_factor=1.0,
        initializer_range=0.02,
        intermediate_size=4096,
        layer_norm_eps=1e-05,
        num_attention_heads=16,
        num_channels=3,
        num_hidden_layers=24,
        patch_size=14,
        projection_dim=1024
    ))

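    # Open the GGUF writer with the "clip" architecture and record the encoder
    # flags and general metadata consumed at load time.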
    fout = GGUFWriter(args.output, arch="clip")

    fout.add_bool("clip.has_text_encoder", False)
    fout.add_bool("clip.has_vision_encoder", True)
    fout.add_bool("clip.has_llava_projector", True)
    fout.add_file_type(ftype)

    model_name = "microsoft/phi-3.5-vision-instruct"
    fout.add_name(model_name)
    fout.add_description("image encoder for " + model_name)
    fout.add_string("clip.projector_type", "mlp")

    # Vision model hparams
    VISION = "clip.vision"
    fout.add_uint32("clip.vision.image_size", clip_vision_config["image_size"])
    fout.add_uint32("clip.vision.patch_size", clip_vision_config["patch_size"])
    fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), clip_vision_config["hidden_size"])
    fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), clip_vision_config["intermediate_size"])
    fout.add_uint32("clip.vision.projection_dim", clip_vision_config["projection_dim"])
    fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), clip_vision_config["num_attention_heads"])
    fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), clip_vision_config["layer_norm_eps"])
    fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), clip_vision_config["num_hidden_layers"])

    fout.add_array("clip.vision.image_mean", [0.48145466, 0.4578275, 0.40821073])
    fout.add_array("clip.vision.image_std", [0.26862954, 0.26130258, 0.27577711])

    # Vision model tensors
    prefix = "model.vision_embed_tokens.img_processor.vision_model."

    fout.add_tensor(
        "v.class_embd",
        tensors.get_tensor(f"{prefix}embeddings.class_embedding").astype(np.float16),
    )
    fout.add_tensor(
        "v.patch_embd.weight",
        tensors.get_tensor(f"{prefix}embeddings.patch_embedding.weight").reshape(1024, 3, 14, 14).astype(np.float16),
    )
    fout.add_tensor(
        "v.position_embd.weight",
        tensors.get_tensor(f"{prefix}embeddings.position_embedding.weight").astype(np.float16),
    )

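    # Phi-3.5-vision's extra learned embeddings (sub_GN / glb_GN) are kept here as
    # float32 alongside the CLIP tensors; they act as separators between sub-image
    # and global image features when the image embeddings are assembled.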
    fout.add_tensor(
        "v.sub_GN",
        tensors.get_tensor("model.vision_embed_tokens.sub_GN").astype(np.float32),
    )
    fout.add_tensor(
        "v.glb_GN",
        tensors.get_tensor("model.vision_embed_tokens.glb_GN").astype(np.float32),
    )

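    # Per-layer encoder tensors: layer norms are stored as float32, attention and
    # MLP projections as float16 (consistent with ftype = 1 above).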
    for i in range(clip_vision_config["num_hidden_layers"]):
        # layer norm
        fout.add_tensor(
            f"blk.{i}.attn_norm.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm1.weight").astype(np.float32),
        )
        fout.add_tensor(
            f"blk.{i}.attn_norm.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm1.bias").astype(np.float32),
        )
        fout.add_tensor(
            f"blk.{i}.ffn_norm.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm2.weight").astype(np.float32),
        )
        fout.add_tensor(
            f"blk.{i}.ffn_norm.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.layer_norm2.bias").astype(np.float32),
        )

        # feed forward
        fout.add_tensor(
            f"blk.{i}.ffn_down.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc1.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.ffn_down.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc1.bias").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.ffn_up.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc2.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.ffn_up.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.mlp.fc2.bias").astype(np.float16),
        )

        # attention
        fout.add_tensor(
            f"blk.{i}.attn_k.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.k_proj.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_k.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.k_proj.bias").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_output.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.out_proj.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_output.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.out_proj.bias").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_q.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.q_proj.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_q.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.q_proj.bias").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_v.weight",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.v_proj.weight").astype(np.float16),
        )
        fout.add_tensor(
            f"blk.{i}.attn_v.bias",
            tensors.get_tensor(f"{prefix}encoder.layers.{i}.self_attn.v_proj.bias").astype(np.float16),
        )

    fout.add_tensor(
        "output_norm.weight",
        tensors.get_tensor(f"{prefix}post_layernorm.weight").astype(np.float32),
    )

    fout.write_header_to_file()
    fout.write_kv_data_to_file()
    fout.write_tensors_to_file()
    fout.close()

if __name__ == "__main__":
    main()
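
Example invocation (the checkpoint directory name is illustrative; point --model at a local download of the Phi-3.5-vision-instruct model containing config.json and the safetensors shards):

    python convert_image_gguf.py --model ./Phi-3.5-vision-instruct --output phi-3.5-vision-clip.gguf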