neiths committed on
Commit
37346c1
1 Parent(s): cf0e3dd

add some important files

README.md CHANGED
@@ -1,13 +1,50 @@
  ---
- title: Llama 3 Vision Gguf
- emoji: 💻
- colorFrom: indigo
- colorTo: purple
- sdk: streamlit
- sdk_version: 1.36.0
- app_file: app.py
- pinned: false
- license: mit
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
  ---
+ language:
+ - en
+ license: apache-2.0
  ---

+ # FOR HF VERSION: https://huggingface.co/qresearch/llama-3-vision-alpha-hf
+ # llama3-vision-alpha
+
+ projection module trained to add vision capabilities to Llama 3 using SigLIP. built by [@yeswondwerr](https://x.com/yeswondwerr) and [@qtnx_](https://x.com/qtnx_)
+
+ **usage**
+
+ ```
+ pip install -r requirements.txt
+ ```
+
+ ```
+ python __main__.py -i image_path
+ ```
+
+ **examples**
+
+ | Image | Examples |
+ | --- | --- |
+ | <img src="assets/demo-1.jpg" width="200"/> | **What is the title of this book? answer briefly**<br>The title of the book is "The Little Book of Deep Learning".<br><br>**Where is the person standing? answer briefly**<br>The person is standing on the balcony.<br><br>**Describe the image**<br>The image shows a person holding a book with a cityscape visible through the window behind them. The book has a cover with a title that reads "The Little Book of Deep Learning" in bold letters. |
+ | <img src="assets/demo-2.jpg" width="200"/> | **What type of food is the girl holding? answer briefly**<br>A hamburger!<br><br>**What color is the woman's hair? answer briefly**<br>It's white!<br><br>**Describe the image**<br>The image is of a young girl with short, curly hair and a sweet smile, holding a giant hamburger in her hand. She's sitting at a table with a festive dinner setting, surrounded by candles and a warm glow. Her eyes are shining with excitement and contentment as she takes a big bite of the burger. |
+
+ **acknowledgements**
+
+ - Liu et al. : [LLaVA](https://arxiv.org/abs/2304.08485)
+ - Moon et al. : [AnyMAL](https://arxiv.org/abs/2309.16058)
+ - vikhyatk : moondream, test images
+ ```
+ .x+=:.
+ z` ^% .uef^"
+ .u . . <k .u . :d88E
+ .u@u .d88B :@8c .u .@8Ned8" .u u .d88B :@8c . `888E
+ .zWF8888bx ="8888f8888r ud8888. .@^%8888" ud8888. us888u. ="8888f8888r .udR88N 888E .z8k
+ .888 9888 4888>'88" :888'8888. x88: `)8b. :888'8888. .@88 "8888" 4888>'88" <888'888k 888E~?888L
+ I888 9888 4888> ' d888 '88%" 8888N=*8888 d888 '88%" 9888 9888 4888> ' 9888 'Y" 888E 888E
+ I888 9888 4888> 8888.+" %8" R88 8888.+" 9888 9888 4888> 9888 888E 888E
+ I888 9888 .d888L .+ 8888L @8Wou 9% 8888L 9888 9888 .d888L .+ 9888 888E 888E
+ `888Nx?888 ^"8888*" '8888c. .+ .888888P` '8888c. .+ 9888 9888 ^"8888*" ?8888u../ 888E 888E
+ "88" '888 "Y" "88888% ` ^"F "88888% "888*""888" "Y" "8888P' m888N= 888>
+ 88E "YP' "YP' ^Y" ^Y' "P' `Y" 888
+ 98> J88"
+ '8 @%
+ ` :"
+ ```
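The README above describes the core idea: a trained projection module maps SigLIP image features into the Llama 3 token-embedding space, with the trained weights shipped in `mm_projector.bin` and loaded by `load_projection_module` in the `__main__.py` added below. As a minimal, illustrative sketch of that mapping (the `projector` weights and the random `image_features` tensor here are stand-ins, not the shipped checkpoint; 729 is the patch-token count produced by siglip-so400m-patch14-384 for a 384x384 image):

```
import torch
import torch.nn as nn

# Vision-to-language adapter sketch: SigLIP features (1152-dim) are mapped into
# the Llama 3 8B embedding space (4096-dim) by Linear -> GELU -> Linear,
# mirroring ProjectionModule in __main__.py. Weights here are random stand-ins;
# the trained ones come from mm_projector.bin.
projector = nn.Sequential(
    nn.Linear(1152, 4096),
    nn.GELU(),
    nn.Linear(4096, 4096),
)

# Dummy SigLIP output: (batch, patch_tokens, feature_dim).
image_features = torch.randn(1, 729, 1152)

projected = projector(image_features)
print(projected.shape)  # torch.Size([1, 729, 4096])
```

In the committed `__main__.py`, the projected tensor is spliced between the text-token embeddings on either side of the `<image>` placeholder and passed to `model.generate(inputs_embeds=...)`.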
__main__.py ADDED
@@ -0,0 +1,225 @@
+ import argparse
+ import sys
+ import torch
+ import torch.nn as nn
+ from PIL import Image
+ from transformers import (
+     AutoTokenizer,
+     BitsAndBytesConfig,
+     LlamaForCausalLM,
+     SiglipImageProcessor,
+     SiglipVisionModel,
+ )
+ from transformers import TextStreamer
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+
+ def tokenizer_image_token(prompt, tokenizer, image_token_index=-200):
+     # Split the prompt on the <image> placeholder, tokenize each chunk, and
+     # insert the image_token_index sentinel where the image features will go.
+     prompt_chunks = prompt.split("<image>")
+     tokenized_chunks = [tokenizer(chunk).input_ids for chunk in prompt_chunks]
+     input_ids = tokenized_chunks[0]
+
+     for chunk in tokenized_chunks[1:]:
+         input_ids.append(image_token_index)
+         input_ids.extend(chunk[1:])  # Exclude BOS token on nonzero index
+
+     return torch.tensor(input_ids, dtype=torch.long)
+
+
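+ # process_tensors splices the projected image features into the text embeddings
+ # at the position of the -200 <image> sentinel and builds a matching all-ones
+ # attention mask for the concatenated sequence.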
+ def process_tensors(input_ids, image_features, embedding_layer):
+     # Find the index of -200 in input_ids
+     split_index = (input_ids == -200).nonzero(as_tuple=True)[1][0]
+
+     # Split the input_ids at the index found, excluding -200
+     input_ids_1 = input_ids[:, :split_index]
+     input_ids_2 = input_ids[:, split_index + 1 :]
+
+     # Convert input_ids to embeddings
+     embeddings_1 = embedding_layer(input_ids_1)
+     embeddings_2 = embedding_layer(input_ids_2)
+
+     device = image_features.device
+     token_embeddings_part1 = embeddings_1.to(device)
+     token_embeddings_part2 = embeddings_2.to(device)
+
+     # Concatenate the token embeddings and image features
+     concatenated_embeddings = torch.cat(
+         [token_embeddings_part1, image_features, token_embeddings_part2], dim=1
+     )
+
+     # Create the corrected attention mask
+     attention_mask = torch.ones(
+         concatenated_embeddings.shape[:2], dtype=torch.long, device=device
+     )
+     return concatenated_embeddings, attention_mask
+
+
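+ # initialize_models loads the Llama 3 8B Instruct backbone and the SigLIP
+ # vision tower. The Hugging Face Hub IDs and the optional 4-bit BitsAndBytes
+ # config are left commented out in favour of local checkpoint paths.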
+ def initialize_models():
+     # bnb_config = BitsAndBytesConfig(
+     #     load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
+     # )
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         # "unsloth/llama-3-8b-Instruct",
+         r"E:\Workspace\BAP\LlamaVision\llama-3-8b-Instruct",  # raw string: keep backslashes literal
+         use_fast=True,
+     )
+     model = LlamaForCausalLM.from_pretrained(
+         # "unsloth/llama-3-8b-Instruct",
+         r"E:\Workspace\BAP\LlamaVision\llama-3-8b-Instruct",
+         torch_dtype=torch.float16,
+         device_map="auto",
+         # quantization_config=bnb_config,
+     )
+
+     for param in model.base_model.parameters():
+         param.requires_grad = False
+
+     # model_name = "google/siglip-so400m-patch14-384"
+     model_name = r"E:\Workspace\BAP\LlamaVision\siglip-so400m-patch14-384"
+     vision_model = SiglipVisionModel.from_pretrained(
+         model_name, torch_dtype=torch.float16
+     )
+     processor = SiglipImageProcessor.from_pretrained(model_name)
+
+     vision_model = vision_model.to("cpu")
+
+     return tokenizer, model, vision_model, processor
+
+
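+ # ProjectionModule is the trained vision-to-language adapter: a two-layer MLP
+ # with a GELU in between that maps SigLIP features (1152-dim) into the Llama 3
+ # embedding space (4096-dim). Its weights are restored from mm_projector.bin.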
+ class ProjectionModule(nn.Module):
+     def __init__(self, mm_hidden_size, hidden_size):
+         super(ProjectionModule, self).__init__()
+
+         # Directly set up the sequential model
+         self.model = nn.Sequential(
+             nn.Linear(mm_hidden_size, hidden_size),
+             nn.GELU(),
+             nn.Linear(hidden_size, hidden_size),
+         )
+
+     def forward(self, x):
+         return self.model(x)
+
+
+ def load_projection_module(mm_hidden_size=1152, hidden_size=4096, device="cpu"):
+     projection_module = ProjectionModule(mm_hidden_size, hidden_size)
+     # Strip the "mm_projector." prefix from the checkpoint keys so they match
+     # this module's state_dict before loading.
+     checkpoint = torch.load("./mm_projector.bin")
+     checkpoint = {k.replace("mm_projector.", ""): v for k, v in checkpoint.items()}
+     projection_module.load_state_dict(checkpoint)
+     projection_module = projection_module.to(device).half()
+     return projection_module
+
+
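+ # answer_question runs an interactive chat loop over one image: the image is
+ # encoded with SigLIP (penultimate hidden state), projected into the Llama 3
+ # embedding space, spliced into the prompt embeddings in place of <image>, and
+ # the reply is generated with streaming; follow-up turns are appended by
+ # concatenating their embeddings onto the running sequence.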
+ def answer_question(
+     image_path, tokenizer, model, vision_model, processor, projection_module
+ ):
+     image = Image.open(image_path).convert("RGB")
+
+     tokenizer.eos_token = "<|eot_id|>"
+
+     try:
+         q = input("\nuser: ")
+     except EOFError:
+         q = ""
+     if not q:
+         print("no input detected. exiting.")
+         sys.exit()
+
+     question = "<image>" + q
+
+     prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+
+     input_ids = (
+         tokenizer_image_token(prompt, tokenizer)
+         .unsqueeze(0)
+         .to(model.device)
+     )
+
+     streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+     with torch.inference_mode():
+         image_inputs = processor(
+             images=[image],
+             return_tensors="pt",
+             do_resize=True,
+             size={"height": 384, "width": 384},
+         ).to("cpu")
+
+         image_inputs = image_inputs["pixel_values"].squeeze(0)
+
+         image_forward_outs = vision_model(
+             image_inputs.to(device="cpu", dtype=torch.float16).unsqueeze(0),
+             output_hidden_states=True,
+         )
+
+         image_features = image_forward_outs.hidden_states[-2]
+
+         projected_embeddings = projection_module(image_features).to("cpu")
+
+         embedding_layer = model.get_input_embeddings()
+         # text_embeddings = embedding_layer(input_ids)
+
+         new_embeds, attn_mask = process_tensors(
+             input_ids, projected_embeddings, embedding_layer
+         )
+         device = model.device
+         attn_mask = attn_mask.to(device)
+         new_embeds = new_embeds.to(device)
+
+     model_kwargs = {
+         "do_sample": True,
+         "temperature": 0.2,
+         "max_new_tokens": 2000,
+         "use_cache": True,
+         "streamer": streamer,
+         "pad_token_id": tokenizer.eos_token_id,
+     }
+
+     while True:
+         print("assistant: ")
+         generated_ids = model.generate(
+             inputs_embeds=new_embeds, attention_mask=attn_mask, **model_kwargs
+         )[0]
+
+         generated_text = tokenizer.decode(generated_ids, skip_special_tokens=False)
+         try:
+             q = input("\nuser: ")
+         except EOFError:
+             q = ""
+         if not q:
+             print("no input detected. exiting.")
+             sys.exit()
+
+         # Append the previous reply and the new question, then extend the
+         # running embedding sequence and attention mask.
+         new_text = (
+             generated_text
+             + "<|start_header_id|>user<|end_header_id|>\n\n"
+             + q
+             + "<|start_header_id|>assistant<|end_header_id|>\n\n"
+         )
+         new_input_ids = tokenizer(new_text, return_tensors="pt").input_ids.to(
+             device
+         )
+         new_embeddings = embedding_layer(new_input_ids)
+
+         new_embeds = torch.cat([new_embeds, new_embeddings], dim=1)
+         attn_mask = torch.ones(new_embeds.shape[:2], dtype=torch.long, device=device)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Answer questions based on an image")
+     parser.add_argument("-i", "--image", required=True, help="Path to the image file")
+     args = parser.parse_args()
+
+     tokenizer, model, vision_model, processor = initialize_models()
+     projection_module = load_projection_module()
+
+     answer_question(
+         args.image,
+         tokenizer,
+         model,
+         vision_model,
+         processor,
+         projection_module,
+     )
assets/Im5.jpg ADDED
assets/demo-1.jpg ADDED
assets/demo-2.jpg ADDED
assets/demo-3.jpg ADDED
mm_projector.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4c67486e883bf7f02b9756850c6f1914e7146936b49805bd3ca8583a71c4d40f
+ size 43009661
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ accelerate==0.29.3
+ bitsandbytes==0.43.1
+ pillow==10.3.0
+ torch==2.3.0
+ transformers==4.40.1