import argparse
import json
import os
import pathlib
import typing

import numpy as np
import numpy.typing as npt
import torch
from gguf import *
from safetensors import safe_open


def k(raw_key: str, arch: str) -> str:
    """Return *raw_key* with its ``{arch}`` placeholder replaced by *arch*.

    Used below with the ``KEY_*`` names that come from ``from gguf import *``
    (presumably key templates containing ``{arch}`` -- confirm against gguf).
    """
    return raw_key.format(arch=arch)


class Args:
    """Plain holder for the two command-line values, ``model`` and ``output``."""

    def __init__(self, model, output):
        self.model = model
        self.output = output


class SafetensorsIndexFile(typing.TypedDict):
    """Subset of the index JSON that this script reads."""

    # Maps a tensor key to the name of the file that stores it.
    weight_map: typing.Dict[str, str]


class SafetensorsIndex:
    """Look up tensors across the files listed in a safetensors index JSON."""

    def __init__(self, index_file_path: str):
        """Parse the index and open every file referenced by its weight map.

        Args:
            index_file_path: Path to the index JSON. The files named in its
                ``weight_map`` are resolved relative to this file's directory.
        """
        directory = os.path.dirname(index_file_path)
        # Close the index file as soon as it is parsed instead of leaking
        # the handle.
        with open(index_file_path, encoding="utf-8") as index_fp:
            self.index = typing.cast(SafetensorsIndexFile, json.load(index_fp))
        self.weight_map = self.index["weight_map"]
        files = set(self.weight_map.values())
        # One handle per distinct file; the handles stay open for the
        # lifetime of this object so tensors can be fetched on demand.
        self.tensors = {
            file: safe_open(os.path.join(directory, file), framework="pt")
            for file in files
        }

    def get_tensor(self, key: str) -> npt.NDArray[np.float32]:
        """Return the tensor stored under *key* as a float32 NumPy array.

        Raises:
            KeyError: If *key* is not present in the weight map.
        """
        tensor = self.tensors[self.weight_map[key]].get_tensor(key)
        # Convert to float32 in torch first, then hand over to NumPy.
        return typing.cast(npt.NDArray[np.float32], tensor.to(torch.float32).numpy())


# Per-layer modules, in the order they are written to the output file:
# (output module name, checkpoint module name, dtype used for the weight).
# Every entry produces a ".weight" tensor followed by a float32 ".bias".
# NOTE(review): fc1 is exported as "ffn_down" and fc2 as "ffn_up", and
# layer_norm1/2 are exported twice (attn_norm/ffn_norm and ln1/ln2). Both are
# kept exactly as the original script had them -- confirm against the
# consumer of the file before changing.
_LAYER_MODULES = (
    ("attn_norm", "layer_norm1", np.float32),
    ("ffn_norm", "layer_norm2", np.float32),
    ("ffn_down", "mlp.fc1", np.float16),
    ("ffn_up", "mlp.fc2", np.float16),
    ("attn_k", "self_attn.k_proj", np.float16),
    ("attn_out", "self_attn.out_proj", np.float16),
    ("attn_q", "self_attn.q_proj", np.float16),
    ("attn_v", "self_attn.v_proj", np.float16),
    ("ln1", "layer_norm1", np.float32),
    ("ln2", "layer_norm2", np.float32),
)


def _add_tensor(
    fout: "GGUFWriter",
    tensors: SafetensorsIndex,
    name: str,
    key: str,
    dtype: npt.DTypeLike,
) -> None:
    """Fetch checkpoint tensor *key*, cast it to *dtype*, add it as *name*."""
    fout.add_tensor(name, tensors.get_tensor(key).astype(dtype))


def main():
    """Export the vision encoder and projector of a checkpoint to a GGUF file.

    ``--model`` is a directory that must contain ``config.json`` and
    ``model.safetensors.index.json`` (plus the files the index refers to);
    ``--output`` is the path of the GGUF file to write.
    """
    parser = argparse.ArgumentParser(description="Extract vision model from safetensors to GGUF")
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="Input model directory (config.json, model.safetensors.index.json and the safetensors files)",
    )
    parser.add_argument("--output", type=str, required=True, help="Output GGUF file")
    args = parser.parse_args()

    dir_model = pathlib.Path(args.model)
    # config.json is parsed before anything is written so that a missing or
    # malformed file still aborts the run; its values are not used below.
    with open(dir_model / "config.json", encoding="utf-8") as config_fp:
        json.load(config_fp)

    tensors = SafetensorsIndex((dir_model / "model.safetensors.index.json").as_posix())

    ftype = 1  # fp16

    # Defaults, source:
    # https://github.com/huggingface/transformers/blob/87134662f73d5e89bb015531ddd1d4662371d317/src/transformers/models/clip/configuration_clip.py#L209
    clip_vision_config = {
        "hidden_size": 768,
        "intermediate_size": 3072,
        "projection_dim": 512,
        "num_hidden_layers": 12,
        "num_attention_heads": 12,
        "num_channels": 3,
        "image_size": 224,
        "patch_size": 32,
        "hidden_act": "quick_gelu",
        "layer_norm_eps": 1e-5,
        "attention_dropout": 0.0,
        "initializer_range": 0.02,
        "initializer_factor": 1.0,
    }
    # Overrides taken from CLIP_VIT_LARGE_PATCH14_336_CONFIG.
    clip_vision_config.update(
        dict(
            attention_dropout=0.0,
            dropout=0.0,
            hidden_act="quick_gelu",
            hidden_size=1024,
            image_size=336,
            initializer_factor=1.0,
            initializer_range=0.02,
            intermediate_size=4096,
            layer_norm_eps=1e-05,
            num_attention_heads=16,
            num_channels=3,
            num_hidden_layers=24,
            patch_size=14,
            projection_dim=768,
        )
    )

    fout = GGUFWriter(args.output, arch="clip")
    try:
        fout.add_bool("clip.has_text_encoder", False)
        fout.add_bool("clip.has_vision_encoder", True)
        fout.add_bool("clip.has_llava_projector", True)
        fout.add_file_type(ftype)

        model_name = "microsoft/phi-3.5-vision-instruct"
        fout.add_name(model_name)
        fout.add_description("image encoder for " + model_name)
        fout.add_string("clip.projector_type", "mlp")

        # Vision model hparams
        VISION = "clip.vision"
        fout.add_uint32("clip.vision.image_size", clip_vision_config["image_size"])
        fout.add_uint32("clip.vision.patch_size", clip_vision_config["patch_size"])
        fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), clip_vision_config["hidden_size"])
        fout.add_uint32(k(KEY_FEED_FORWARD_LENGTH, VISION), clip_vision_config["intermediate_size"])
        fout.add_uint32("clip.vision.projection_dim", clip_vision_config["projection_dim"])
        fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), clip_vision_config["num_attention_heads"])
        fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), clip_vision_config["layer_norm_eps"])
        fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), clip_vision_config["num_hidden_layers"])

        fout.add_array("clip.vision.image_mean", [0.48145466, 0.4578275, 0.40821073])
        fout.add_array("clip.vision.image_std", [0.26862954, 0.26130258, 0.27577711])
        # False for "quick_gelu", True for any other activation name.
        fout.add_bool("clip.use_gelu", clip_vision_config["hidden_act"] != "quick_gelu")

        # Vision model tensors
        prefix = "model.vision_embed_tokens.img_processor.vision_model."
        embed = "model.vision_embed_tokens."

        _add_tensor(fout, tensors, "v.class_embd", f"{prefix}embeddings.class_embedding", np.float32)

        patch_size = clip_vision_config["patch_size"]
        fout.add_tensor(
            "v.patch_embd.weight",
            tensors.get_tensor(f"{prefix}embeddings.patch_embedding.weight")
            .reshape(
                clip_vision_config["hidden_size"],
                clip_vision_config["num_channels"],
                patch_size,
                patch_size,
            )
            .astype(np.float16),
        )
        _add_tensor(
            fout, tensors, "v.position_embd.weight", f"{prefix}embeddings.position_embedding.weight", np.float16
        )

        _add_tensor(fout, tensors, "v.sub_gn", f"{embed}sub_GN", np.float32)
        _add_tensor(fout, tensors, "v.glb_gn", f"{embed}glb_GN", np.float32)

        # Projector: entries 0 and 2 of img_projection.
        for idx in (0, 2):
            _add_tensor(fout, tensors, f"mm.{idx}.weight", f"{embed}img_projection.{idx}.weight", np.float16)
            _add_tensor(fout, tensors, f"mm.{idx}.bias", f"{embed}img_projection.{idx}.bias", np.float32)

        # Encoder layers
        for i in range(clip_vision_config["num_hidden_layers"]):
            layer = f"{prefix}encoder.layers.{i}."
            for gguf_module, ckpt_module, weight_dtype in _LAYER_MODULES:
                _add_tensor(
                    fout, tensors, f"v.blk.{i}.{gguf_module}.weight", f"{layer}{ckpt_module}.weight", weight_dtype
                )
                _add_tensor(
                    fout, tensors, f"v.blk.{i}.{gguf_module}.bias", f"{layer}{ckpt_module}.bias", np.float32
                )

        # Final norms. "pre_layrnorm" (sic) is the key the script has always
        # looked up in the checkpoint; do not "correct" the spelling.
        for gguf_module, ckpt_module in (("post_ln", "post_layernorm"), ("pre_ln", "pre_layrnorm")):
            _add_tensor(fout, tensors, f"v.{gguf_module}.weight", f"{prefix}{ckpt_module}.weight", np.float32)
            _add_tensor(fout, tensors, f"v.{gguf_module}.bias", f"{prefix}{ckpt_module}.bias", np.float32)

        fout.write_header_to_file()
        fout.write_kv_data_to_file()
        fout.write_tensors_to_file()
    finally:
        # Release the writer even when a tensor lookup or a write fails.
        fout.close()


if __name__ == "__main__":
    main()