Narayana02 committed on
Commit 81dbae6
1 Parent(s): 8de60e6

Update app.py

Files changed (1)
  1. app.py +40 -37
app.py CHANGED
@@ -1,4 +1,4 @@
- import streamlit as st
+ import gradio as gr
  import torch
  import onnxruntime as ort
  from PIL import Image
@@ -7,13 +7,9 @@ import numpy as np
  from transformers import AutoTokenizer, AutoProcessor
  import os

- if not os.path.exists("vision_encoder_q4f16.onnx"):
-     os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/vision_encoder_q4f16.onnx')
- if not os.path.exists("decoder_model_merged_q4f16.onnx"):
-     os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/decoder_model_merged_q4f16.onnx')
- if not os.path.exists("embed_tokens_q4f16.onnx"):
-     os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/embed_tokens_q4f16.onnx')
-
+ os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/decoder_model_merged_q4f16.onnx')
+ os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/embed_tokens_q4f16.onnx')
+ os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/vision_encoder_q4f16.onnx')
  # Load the tokenizer and processor
  tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
  processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
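The three wget calls above now run on every start (the os.path.exists guards were dropped in this commit). For reference only, the same ONNX files could also be pulled through the huggingface_hub cache; a minimal sketch, assuming huggingface_hub is installed, and not part of the committed app.py:

    # Sketch: fetch the same ONNX files via the huggingface_hub cache instead of wget.
    from huggingface_hub import hf_hub_download

    repo_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
    for name in ["vision_encoder_q4f16.onnx", "decoder_model_merged_q4f16.onnx", "embed_tokens_q4f16.onnx"]:
        local_path = hf_hub_download(repo_id=repo_id, filename=f"onnx/{name}")
        print(local_path)  # cached local path, usable with ort.InferenceSession

hf_hub_download caches files and skips re-downloading on later runs, which is roughly what the removed os.path.exists checks approximated.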
@@ -22,48 +18,51 @@ vision_encoder_session = ort.InferenceSession("vision_encoder_q4f16.onnx")
  decoder_session = ort.InferenceSession("decoder_model_merged_q4f16.onnx")
  embed_tokens_session = ort.InferenceSession("embed_tokens_q4f16.onnx")

- def merge_input_ids_with_image_features(image_features, inputs_embeds, input_ids, attention_mask, pad_token_id, special_image_token_id):
+ def merge_input_ids_with_image_features(image_features, inputs_embeds, input_ids, attention_mask,pad_token_id,special_image_token_id):
      num_images, num_image_patches, embed_dim = image_features.shape
      batch_size, sequence_length = input_ids.shape
-
+     left_padding = not np.sum(input_ids[:, -1] == pad_token_id)
+     # 1. Create a mask to know where special image tokens are
      special_image_token_mask = input_ids == special_image_token_id
      num_special_image_tokens = np.sum(special_image_token_mask, axis=-1)
-
+     # Compute the maximum embed dimension
      max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
      batch_indices, non_image_indices = np.where(input_ids != special_image_token_id)
-
+
+     # 2. Compute the positions where text should be written
+     # Calculate new positions for text tokens in merged image-text sequence.
+     # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images - 1` text tokens.
+     # `np.cumsum` computes how each image token shifts subsequent text token positions.
+     # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
      new_token_positions = np.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
      nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
-     new_token_positions += nb_image_pad[:, None]
-
+     if left_padding:
+         new_token_positions += nb_image_pad[:, None]  # offset for left padding
      text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
-
+
+     # 3. Create the full embedding, already padded to the maximum position
      final_embedding = np.zeros((batch_size, max_embed_dim, embed_dim), dtype=np.float32)
      final_attention_mask = np.zeros((batch_size, max_embed_dim), dtype=np.int64)
-
+
+     # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
+     # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
      final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
      final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
-
-     image_to_overwrite = np.full((batch_size, max_embed_dim), False)
+     # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835)
+     image_to_overwrite = np.full((batch_size, max_embed_dim), True)
      image_to_overwrite[batch_indices, text_to_overwrite] = False
-     image_positions = np.where(np.logical_not(image_to_overwrite))
-
-     # Ensure proper reshaping
-     num_image_positions = len(image_positions[0])
-     assert num_image_positions <= image_features.size, "Mismatch in image feature positions and available features."
-
-     # Assign reshaped image features
-     reshaped_image_features = image_features.reshape(-1, embed_dim)[:num_image_positions]
-     final_embedding[image_positions] = reshaped_image_features
+     image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None]
+
+     final_embedding[image_to_overwrite] = image_features.reshape(-1, embed_dim)
      final_attention_mask = np.logical_or(final_attention_mask, image_to_overwrite).astype(final_attention_mask.dtype)
-
      position_ids = final_attention_mask.cumsum(axis=-1) - 1
      position_ids = np.where(final_attention_mask == 0, 1, position_ids)
-
+
+     # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens.
      batch_indices, pad_indices = np.where(input_ids == pad_token_id)
      indices_to_mask = new_token_positions[batch_indices, pad_indices]
      final_embedding[batch_indices, indices_to_mask] = 0
-
+
      return final_embedding, final_attention_mask, position_ids

  # Load model and processor
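The rewrite above replaces the truncating reshape/assert path with a boolean-mask scatter: image_to_overwrite starts all-True, text positions are cleared, and the cumsum guard drops padded slots before the patch embeddings are written in. A quick self-contained check with toy shapes, assuming the new function is in scope (all ids and sizes below are illustrative only):

    import numpy as np

    pad_id, image_token_id = 0, 99                                # toy ids; real values come from the tokenizer/config
    image_features = np.random.rand(1, 3, 4).astype(np.float32)  # 1 image, 3 patches, embed_dim 4
    inputs_embeds = np.random.rand(1, 5, 4).astype(np.float32)   # embeddings for 5 input tokens
    input_ids = np.array([[5, image_token_id, 6, 7, 8]])         # image placeholder at position 1
    attention_mask = np.ones((1, 5), dtype=np.int64)

    emb, mask, pos = merge_input_ids_with_image_features(
        image_features, inputs_embeds, input_ids, attention_mask, pad_id, image_token_id)
    print(emb.shape, mask.shape, pos.shape)  # (1, 7, 4) (1, 7) (1, 7): one image slot expands into 3 patches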
@@ -199,10 +198,14 @@ def describe_image(image):
      decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
      return decoded_output

- # Streamlit app
- st.title("Image Description Generator")
- uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
- if uploaded_image is not None:
-     image = Image.open(uploaded_image)
-     description = describe_image(image)
-     st.text_area("Description", description, height=300)
+ # Create Gradio interface
+ interface = gr.Interface(
+     fn=describe_image,
+     inputs=gr.Image(type="pil"),
+     outputs=gr.Textbox(lines=5, placeholder="Description will appear here"),
+     title="Image Description Generator",
+     description="Upload an image to get a detailed description."
+ )
+
+ # Enable API
+ interface.launch(share=True,show_error=True,debug=True)
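With share=True, launch() also prints a public *.gradio.live link, and a single gr.Interface exposes its prediction endpoint to API clients. A minimal client-side sketch, assuming a recent gradio_client (>= 1.0) and the local URL printed by launch(); the image path and URL are placeholders:

    from gradio_client import Client, handle_file

    client = Client("http://127.0.0.1:7860")   # or the public share URL printed by launch()
    description = client.predict(
        handle_file("example.jpg"),            # placeholder path to a local test image
        api_name="/predict",                   # default endpoint name for a single gr.Interface
    )
    print(description)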