add some important files
- README.md +47 -10
- __main__.py +225 -0
- assets/Im5.jpg +0 -0
- assets/demo-1.jpg +0 -0
- assets/demo-2.jpg +0 -0
- assets/demo-3.jpg +0 -0
- mm_projector.bin +3 -0
- requirements.txt +5 -0
README.md
CHANGED
@@ -1,13 +1,50 @@
 ---
-
-
-
-colorTo: purple
-sdk: streamlit
-sdk_version: 1.36.0
-app_file: app.py
-pinned: false
-license: mit
+language:
+- en
+license: apache-2.0
 ---
 
-
+# FOR HF VERSION: https://huggingface.co/qresearch/llama-3-vision-alpha-hf
+# llama3-vision-alpha
+
+projection module trained to add vision capabilities to Llama 3 using SigLIP. built by [@yeswondwerr](https://x.com/yeswondwerr) and [@qtnx_](https://x.com/qtnx_)
+
+**usage**
+
+```
+pip install -r requirements.txt
+```
+
+```
+python __main__.py -i image_path
+```
+
+**examples**
+
+| Image | Examples |
+| --- | --- |
+| <img src="assets/demo-1.jpg" width="200"/> | **What is the title of this book? answer briefly**<br>The title of the book is "The Little Book of Deep Learning".<br><br>**Where is the person standing? answer briefly**<br>The person is standing on the balcony.<br><br>**Describe the image**<br>The image shows a person holding a book with a cityscape visible through the window behind them. The book has a cover with a title that reads "The Little Book of Deep Learning" in bold letters. |
+| <img src="assets/demo-2.jpg" width="200"/> | **What type of food is the girl holding? answer briefly**<br>A hamburger!<br><br>**What color is the woman's hair? answer briefly**<br>It's white!<br><br>**Describe the image**<br>The image is of a young girl with short, curly hair and a sweet smile, holding a giant hamburger in her hand. She's sitting at a table with a festive dinner setting, surrounded by candles and a warm glow. Her eyes are shining with excitement and contentment as she takes a big bite of the burger. |
+
+**acknowledgements**
+
+- Liu et al. : [LLaVA](https://arxiv.org/abs/2304.08485)
+- Moon et al. : [AnyMAL](https://arxiv.org/abs/2309.16058)
+- vikhyatk : moondream, test images
+```
+                  .x+=:.
+z`    ^%      .uef^"
+   .u    .      .   <k     .u    .        :d88E
+ .u@u  .d88B :@8c   .u  .@8Ned8" .u    u  .d88B :@8c  .     `888E
+.zWF8888bx ="8888f8888r ud8888. .@^%8888" ud8888. us888u. ="8888f8888r .udR88N 888E .z8k
+.888 9888 4888>'88" :888'8888. x88: `)8b. :888'8888. .@88 "8888" 4888>'88" <888'888k 888E~?888L
+I888 9888 4888> '  d888 '88%" 8888N=*8888 d888 '88%" 9888  9888 4888> '  9888 'Y"  888E  888E
+I888 9888 4888>    8888.+"     %8" R88    8888.+"    9888  9888 4888>    9888      888E  888E
+I888 9888 .d888L .+ 8888L    @8Wou 9%     8888L      9888  9888 .d888L .+ 9888     888E  888E
+`888Nx?888 ^"8888*" '8888c. .+ .888888P`  '8888c. .+ 9888  9888 ^"8888*" ?8888u../ 888E  888E
+ "88" '888   "Y"    "88888%  `  ^"F        "88888%  "888*""888"   "Y"    "8888P'  m888N= 888>
+       88E            "YP'                   "YP'     ^Y"   ^Y'            "P'     `Y"   888
+       98>                                                                              J88"
+       '8                                                                               @%
+        `                                                                              :"
+```
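For context on the shapes involved: siglip-so400m-patch14-384 encodes a 384 × 384 image into 27 × 27 = 729 patch features of width 1152, and Llama 3 8B uses 4096-dim token embeddings, which is where the `mm_hidden_size=1152, hidden_size=4096` defaults in `__main__.py` below come from. A minimal, self-contained sketch of that projection step with dummy tensors (random weights, for shape checking only; the trained weights ship in `mm_projector.bin`):

```python
import torch
import torch.nn as nn

# Same architecture as ProjectionModule in __main__.py: two linear layers
# with a GELU in between, mapping SigLIP features into Llama 3's
# embedding space. Weights here are random, so this only checks shapes.
projector = nn.Sequential(
    nn.Linear(1152, 4096),  # mm_hidden_size -> hidden_size
    nn.GELU(),
    nn.Linear(4096, 4096),
)

image_features = torch.randn(1, 729, 1152)  # dummy SigLIP patch features
projected = projector(image_features)
print(projected.shape)  # torch.Size([1, 729, 4096]) -> 729 "image tokens"
```

The 729 projected vectors are spliced into the prompt's token embeddings in place of the `<image>` placeholder, so the frozen language model consumes them like ordinary tokens.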
__main__.py
ADDED
@@ -0,0 +1,225 @@
+import argparse
+import sys
+import torch
+import torch.nn as nn
+from PIL import Image
+from transformers import (
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    LlamaForCausalLM,
+    SiglipImageProcessor,
+    SiglipVisionModel,
+)
+from transformers import TextStreamer
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+def tokenizer_image_token(prompt, tokenizer, image_token_index=-200):
+    prompt_chunks = prompt.split("<image>")
+    tokenized_chunks = [tokenizer(chunk).input_ids for chunk in prompt_chunks]
+    input_ids = tokenized_chunks[0]
+
+    for chunk in tokenized_chunks[1:]:
+        input_ids.append(image_token_index)
+        input_ids.extend(chunk[1:])  # Exclude BOS token on nonzero index
+
+    return torch.tensor(input_ids, dtype=torch.long)
+
+
+def process_tensors(input_ids, image_features, embedding_layer):
+    # Find the index of the -200 image sentinel in input_ids
+    split_index = (input_ids == -200).nonzero(as_tuple=True)[1][0]
+
+    # Split the input_ids at the index found, excluding -200
+    input_ids_1 = input_ids[:, :split_index]
+    input_ids_2 = input_ids[:, split_index + 1 :]
+
+    # Convert input_ids to embeddings
+    embeddings_1 = embedding_layer(input_ids_1)
+    embeddings_2 = embedding_layer(input_ids_2)
+
+    device = image_features.device
+    token_embeddings_part1 = embeddings_1.to(device)
+    token_embeddings_part2 = embeddings_2.to(device)
+
+    # Concatenate the token embeddings and image features
+    concatenated_embeddings = torch.cat(
+        [token_embeddings_part1, image_features, token_embeddings_part2], dim=1
+    )
+
+    # Create the corrected attention mask
+    attention_mask = torch.ones(
+        concatenated_embeddings.shape[:2], dtype=torch.long, device=device
+    )
+    return concatenated_embeddings, attention_mask
+
+
+def initialize_models():
+    # bnb_config = BitsAndBytesConfig(
+    #     load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
+    # )
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        # "unsloth/llama-3-8b-Instruct",
+        r"E:\Workspace\BAP\LlamaVision\llama-3-8b-Instruct",  # raw string keeps Windows backslashes literal
+        use_fast=True,
+    )
+    model = LlamaForCausalLM.from_pretrained(
+        # "unsloth/llama-3-8b-Instruct",
+        r"E:\Workspace\BAP\LlamaVision\llama-3-8b-Instruct",
+        torch_dtype=torch.float16,
+        device_map="auto",
+        # quantization_config=bnb_config,
+    )
+
+    # Freeze the language model; only the projector was trained
+    for param in model.base_model.parameters():
+        param.requires_grad = False
+
+    # model_name = "google/siglip-so400m-patch14-384"
+    model_name = r"E:\Workspace\BAP\LlamaVision\siglip-so400m-patch14-384"
+    vision_model = SiglipVisionModel.from_pretrained(
+        model_name, torch_dtype=torch.float16
+    )
+    processor = SiglipImageProcessor.from_pretrained(model_name)
+
+    vision_model = vision_model.to("cpu")
+
+    return tokenizer, model, vision_model, processor
+
+
+class ProjectionModule(nn.Module):
+    def __init__(self, mm_hidden_size, hidden_size):
+        super(ProjectionModule, self).__init__()
+
+        # Directly set up the sequential model
+        self.model = nn.Sequential(
+            nn.Linear(mm_hidden_size, hidden_size),
+            nn.GELU(),
+            nn.Linear(hidden_size, hidden_size),
+        )
+
+    def forward(self, x):
+        return self.model(x)
+
+
+def load_projection_module(mm_hidden_size=1152, hidden_size=4096, device="cpu"):
+    projection_module = ProjectionModule(mm_hidden_size, hidden_size)
+    checkpoint = torch.load("./mm_projector.bin")
+    checkpoint = {k.replace("mm_projector.", ""): v for k, v in checkpoint.items()}
+    projection_module.load_state_dict(checkpoint)
+    projection_module = projection_module.to(device).half()
+    return projection_module
+
+
+def answer_question(
+    image_path, tokenizer, model, vision_model, processor, projection_module
+):
+    image = Image.open(image_path).convert("RGB")
+
+    tokenizer.eos_token = "<|eot_id|>"
+
+    try:
+        q = input("\nuser: ")
+    except EOFError:
+        q = ""
+    if not q:
+        print("no input detected. exiting.")
+        sys.exit()
+
+    question = "<image>" + q
+
+    prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+    input_ids = (
+        tokenizer_image_token(prompt, tokenizer)
+        .unsqueeze(0)
+        .to(model.device)
+    )
+
+    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+    with torch.inference_mode():
+        image_inputs = processor(
+            images=[image],
+            return_tensors="pt",
+            do_resize=True,
+            size={"height": 384, "width": 384},
+        ).to("cpu")
+
+        image_inputs = image_inputs["pixel_values"].squeeze(0)
+
+        image_forward_outs = vision_model(
+            image_inputs.to(device="cpu", dtype=torch.float16).unsqueeze(0),
+            output_hidden_states=True,
+        )
+
+        # Penultimate SigLIP hidden layer, as in LLaVA-style projectors
+        image_features = image_forward_outs.hidden_states[-2]
+
+        projected_embeddings = projection_module(image_features).to("cpu")
+
+        embedding_layer = model.get_input_embeddings()
+        # text_embeddings = embedding_layer(input_ids)
+
+        new_embeds, attn_mask = process_tensors(
+            input_ids, projected_embeddings, embedding_layer
+        )
+        device = model.device
+        attn_mask = attn_mask.to(device)
+        new_embeds = new_embeds.to(device)
+
+        model_kwargs = {
+            "do_sample": True,
+            "temperature": 0.2,
+            "max_new_tokens": 2000,
+            "use_cache": True,
+            "streamer": streamer,
+            "pad_token_id": tokenizer.eos_token_id,
+        }
+
+        while True:
+            print("assistant: ")
+            generated_ids = model.generate(
+                inputs_embeds=new_embeds, attention_mask=attn_mask, **model_kwargs
+            )[0]
+
+            generated_text = tokenizer.decode(generated_ids, skip_special_tokens=False)
+            try:
+                q = input("\nuser: ")
+            except EOFError:
+                q = ""
+            if not q:
+                print("no input detected. exiting.")
+                sys.exit()  # actually stop; a bare print here would loop forever
+
+            new_text = (
+                generated_text
+                + "<|start_header_id|>user<|end_header_id|>\n\n"
+                + q
+                + "<|start_header_id|>assistant<|end_header_id|>\n\n"
+            )
+            new_input_ids = tokenizer(new_text, return_tensors="pt").input_ids.to(
+                device
+            )
+            new_embeddings = embedding_layer(new_input_ids)
+
+            new_embeds = torch.cat([new_embeds, new_embeddings], dim=1)
+            attn_mask = torch.ones(new_embeds.shape[:2], dtype=torch.long, device=device)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Answer questions based on an image")
+    parser.add_argument("-i", "--image", required=True, help="Path to the image file")
+    args = parser.parse_args()
+
+    tokenizer, model, vision_model, processor = initialize_models()
+    projection_module = load_projection_module()
+
+    answer_question(
+        args.image,
+        tokenizer,
+        model,
+        vision_model,
+        processor,
+        projection_module,
+    )
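The one non-standard tokenization step above is `tokenizer_image_token`: the prompt is split on `<image>`, each chunk is tokenized separately, and the sentinel id -200 is spliced in where `process_tensors` later injects the projected image features. A toy illustration of what it produces (the tokenizer here is a stub, not the real Llama 3 tokenizer):

```python
import torch
from types import SimpleNamespace

class StubTokenizer:
    """Stand-in tokenizer: BOS id 1, then one id per character."""
    def __call__(self, text):
        return SimpleNamespace(input_ids=[1] + [ord(c) for c in text])

def tokenizer_image_token(prompt, tokenizer, image_token_index=-200):
    chunks = [tokenizer(c).input_ids for c in prompt.split("<image>")]
    input_ids = chunks[0]
    for chunk in chunks[1:]:
        input_ids.append(image_token_index)  # placeholder for the image embeddings
        input_ids.extend(chunk[1:])          # drop the chunk's duplicated BOS
    return torch.tensor(input_ids, dtype=torch.long)

print(tokenizer_image_token("<image>hi", StubTokenizer()))
# tensor([   1, -200,  104,  105])
```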
assets/Im5.jpg
ADDED
assets/demo-1.jpg
ADDED
assets/demo-2.jpg
ADDED
assets/demo-3.jpg
ADDED
mm_projector.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c67486e883bf7f02b9756850c6f1914e7146936b49805bd3ca8583a71c4d40f
+size 43009661
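`mm_projector.bin` is stored via Git LFS, so a clone without LFS installed yields only this three-line pointer file rather than the ~43 MB checkpoint. A small sketch to check a downloaded copy against the oid and size recorded above (the local filename is an assumption):

```python
import hashlib

# Values copied from the LFS pointer above
EXPECTED_OID = "4c67486e883bf7f02b9756850c6f1914e7146936b49805bd3ca8583a71c4d40f"
EXPECTED_SIZE = 43009661

with open("mm_projector.bin", "rb") as f:  # assumed local path
    data = f.read()

assert len(data) == EXPECTED_SIZE, f"size mismatch: {len(data)}"
assert hashlib.sha256(data).hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("mm_projector.bin matches the LFS pointer")
```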
requirements.txt
ADDED
@@ -0,0 +1,5 @@
+accelerate==0.29.3
+bitsandbytes==0.43.1
+pillow==10.3.0
+torch==2.3.0
+transformers==4.40.1