Narayana02 committed
Commit 81dbae6 · 1 Parent(s): 8de60e6
Update app.py

app.py CHANGED
@@ -1,4 +1,4 @@
-import
+import gradio as gr
 import torch
 import onnxruntime as ort
 from PIL import Image
@@ -7,13 +7,9 @@ import numpy as np
 from transformers import AutoTokenizer, AutoProcessor
 import os
 
-
-
-
-os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/decoder_model_merged_q4f16.onnx')
-if not os.path.exists("embed_tokens_q4f16.onnx"):
-    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/embed_tokens_q4f16.onnx')
-
+os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/decoder_model_merged_q4f16.onnx')
+os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/embed_tokens_q4f16.onnx')
+os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/vision_encoder_q4f16.onnx')
 # Load the tokenizer and processor
 tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
 processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
@@ -22,48 +18,51 @@ vision_encoder_session = ort.InferenceSession("vision_encoder_q4f16.onnx")
 decoder_session = ort.InferenceSession("decoder_model_merged_q4f16.onnx")
 embed_tokens_session = ort.InferenceSession("embed_tokens_q4f16.onnx")
 
-def merge_input_ids_with_image_features(image_features, inputs_embeds, input_ids, attention_mask,
+def merge_input_ids_with_image_features(image_features, inputs_embeds, input_ids, attention_mask,pad_token_id,special_image_token_id):
     num_images, num_image_patches, embed_dim = image_features.shape
     batch_size, sequence_length = input_ids.shape
-
+    left_padding = not np.sum(input_ids[:, -1] == pad_token_id)
+    # 1. Create a mask to know where special image tokens are
     special_image_token_mask = input_ids == special_image_token_id
     num_special_image_tokens = np.sum(special_image_token_mask, axis=-1)
-
+    # Compute the maximum embed dimension
     max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
     batch_indices, non_image_indices = np.where(input_ids != special_image_token_id)
-
+
+    # 2. Compute the positions where text should be written
+    # Calculate new positions for text tokens in merged image-text sequence.
+    # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images - 1` text tokens.
+    # `np.cumsum` computes how each image token shifts subsequent text token positions.
+    # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
     new_token_positions = np.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
     nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
-
-
+    if left_padding:
+        new_token_positions += nb_image_pad[:, None]  # offset for left padding
     text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
-
+
+    # 3. Create the full embedding, already padded to the maximum position
     final_embedding = np.zeros((batch_size, max_embed_dim, embed_dim), dtype=np.float32)
     final_attention_mask = np.zeros((batch_size, max_embed_dim), dtype=np.int64)
-
+
+    # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
+    # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
    final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
    final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
-
-    image_to_overwrite = np.full((batch_size, max_embed_dim),
+    # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835)
+    image_to_overwrite = np.full((batch_size, max_embed_dim), True)
     image_to_overwrite[batch_indices, text_to_overwrite] = False
-
-
-
-    num_image_positions = len(image_positions[0])
-    assert num_image_positions <= image_features.size, "Mismatch in image feature positions and available features."
-
-    # Assign reshaped image features
-    reshaped_image_features = image_features.reshape(-1, embed_dim)[:num_image_positions]
-    final_embedding[image_positions] = reshaped_image_features
+    image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None]
+
+    final_embedding[image_to_overwrite] = image_features.reshape(-1, embed_dim)
     final_attention_mask = np.logical_or(final_attention_mask, image_to_overwrite).astype(final_attention_mask.dtype)
-
     position_ids = final_attention_mask.cumsum(axis=-1) - 1
     position_ids = np.where(final_attention_mask == 0, 1, position_ids)
-
+
+    # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens.
     batch_indices, pad_indices = np.where(input_ids == pad_token_id)
     indices_to_mask = new_token_positions[batch_indices, pad_indices]
     final_embedding[batch_indices, indices_to_mask] = 0
-
+
     return final_embedding, final_attention_mask, position_ids
 
 # Load model and processor
@@ -199,10 +198,14 @@ def describe_image(image):
     decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
     return decoded_output
 
-#
-
-
-
-
-
-
+# Create Gradio interface
+interface = gr.Interface(
+    fn=describe_image,
+    inputs=gr.Image(type="pil"),
+    outputs=gr.Textbox(lines=5, placeholder="Description will appear here"),
+    title="Image Description Generator",
+    description="Upload an image to get a detailed description."
+)
+
+# Enable API
+interface.launch(share=True,show_error=True,debug=True)