merve HF staff committed on
Commit
a65d678
·
verified ·
1 Parent(s): bf0383a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -6
app.py CHANGED
@@ -2,6 +2,19 @@ import gradio as gr
2
  from gradio_client import Client
3
  import os
4
  import json
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  def postprocess_kosmos_out(result):
7
  token = ""
@@ -84,16 +97,42 @@ def generate_caption(image_path, caption_bool):
84
 
85
  kosmos_caption = generate_caption_kosmos(image_path, caption_bool)
86
  fuyu_caption = generate_caption_fuyu(image_path, caption_bool)
 
87
 
88
- return kosmos_caption, fuyu_caption
89
 
90
 
91
  def generate_answers(image_path, question):
92
 
93
  kosmos_answer = generate_answer_kosmos(image_path, question)
94
  fuyu_answer = generate_answer_fuyu(image_path, question)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- return kosmos_answer, fuyu_answer
97
 
98
  title = "# Comparing Vision Language Models"
99
 
@@ -116,14 +155,14 @@ with gr.Blocks(css=css) as demo:
116
  with gr.Tab("Visual Question Answering"):
117
  with gr.Column():
118
  input_image = gr.Image(label = "Input Image", type="filepath")
119
- question = gr.Textbox(label = "question")
120
  run_button = gr.Button("Answer")
121
  with gr.Column():
122
  answer_kosmos = gr.Textbox(label="Answer generated by KOSMOS-2")
123
  answer_fuyu = gr.Textbox(label="Answer generated by Fuyu-8B")
124
-
125
  outputs_answer = [
126
- answer_kosmos, answer_fuyu
127
  ]
128
 
129
  gr.Examples(
@@ -149,7 +188,7 @@ with gr.Blocks(css=css) as demo:
149
  caption_kosmos = gr.Textbox(label="Caption generated by KOSMOS-2")
150
  caption_fuyu = gr.Textbox(label="Caption generated by Fuyu-8B")
151
 
152
- outputs_caption = [caption_kosmos, caption_fuyu]
153
 
154
  gr.Examples(
155
  examples = [["./cat.png", True], ["./cat.png", False]],
 
2
  from gradio_client import Client
3
  import os
4
  import json
5
+ from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
6
+ import torch
7
+ from PIL import Image
8
+ import requests
9
+ import spaces
10
+
11
# Prefer the GPU when torch reports one; otherwise run on CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Single source of truth for the checkpoint so processor and model cannot drift apart.
LLAVA_MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"

processor = LlavaNextProcessor.from_pretrained(LLAVA_MODEL_ID)

# Weights are loaded in half precision (torch.float16).
model = LlavaNextForConditionalGeneration.from_pretrained(
    LLAVA_MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
# Pass the torch.device object; the string literal "device" is not a valid
# device specifier and makes .to() raise.
model.to(device)
17
+
18
 
19
  def postprocess_kosmos_out(result):
20
  token = ""
 
97
 
98
  kosmos_caption = generate_caption_kosmos(image_path, caption_bool)
99
  fuyu_caption = generate_caption_fuyu(image_path, caption_bool)
100
+ llava_caption = generate_caption_llava(image_path, caption_bool)
101
 
102
+ return kosmos_caption, fuyu_caption, llava_caption
103
 
104
 
def generate_answers(image_path, question):
    """Ask each model the same question about the same image.

    Args:
        image_path: Path of the image, forwarded unchanged to every model.
        question: The question, forwarded unchanged to every model.

    Returns:
        A 3-tuple ``(kosmos_answer, fuyu_answer, llava_answer)``.
    """
    # Tuple elements are evaluated left to right, so the models are queried
    # in the order KOSMOS-2, Fuyu, LLaVA.
    return (
        generate_answer_kosmos(image_path, question),
        generate_answer_fuyu(image_path, question),
        generate_answer_llava(image_path, question),
    )
112
+
113
+
114
@spaces.GPU
def generate_caption_llava(image_path, caption_bool):
    """Caption the image at ``image_path`` with the LLaVA-NeXT model.

    Args:
        image_path: Path of the image file to caption.
        caption_bool: When truthy, request a detailed caption; otherwise
            request a brief one.

    Returns:
        The newly generated text only (the prompt is not included),
        with surrounding whitespace stripped.
    """
    # NOTE(review): the "<image>" placeholder follows the prompt format on the
    # llava-v1.6-mistral-7b-hf model card; confirm against the installed
    # transformers version.
    if caption_bool:
        text_prompt = "[INST] <image>\nCaption this image in detail in objective manner.[/INST]"
    else:
        text_prompt = "[INST] <image>\nCaption this image briefly in objective manner. [/INST]"

    # Context manager so the image file handle is released once the
    # processor has consumed the pixels.
    with Image.open(image_path) as image:
        inputs = processor(text_prompt, image, return_tensors="pt").to(device)

    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=100)

    # The generated sequence starts with the prompt tokens; drop them by
    # token count rather than by character count, because the decoded prompt
    # need not have the same length as ``text_prompt``.
    prompt_length = inputs["input_ids"].shape[1]
    # decode() returns a plain string, so it must not be indexed by key.
    return processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
127
+
128
@spaces.GPU
def generate_answer_llava(image_path, question):
    """Answer ``question`` about the image at ``image_path`` with LLaVA-NeXT.

    Args:
        image_path: Path of the image file the question refers to.
        question: Free-form question text inserted into the prompt.

    Returns:
        The newly generated text only (the prompt is not included),
        with surrounding whitespace stripped.
    """
    # NOTE(review): the "<image>" placeholder follows the prompt format on the
    # llava-v1.6-mistral-7b-hf model card; confirm against the installed
    # transformers version.
    text_prompt = f"[INST] <image>\n{question} [/INST]"

    # Context manager so the image file handle is released once the
    # processor has consumed the pixels.
    with Image.open(image_path) as image:
        inputs = processor(text_prompt, image, return_tensors="pt").to(device)

    output = model.generate(**inputs, max_new_tokens=100)

    # The generated sequence starts with the prompt tokens; drop them by
    # token count rather than by character count, because the decoded prompt
    # need not have the same length as ``text_prompt``.
    prompt_length = inputs["input_ids"].shape[1]
    # decode() returns a plain string, so it must not be indexed by key.
    return processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
134
+
135
 
 
136
 
137
  title = "# Comparing Vision Language Models"
138
 
 
155
  with gr.Tab("Visual Question Answering"):
156
  with gr.Column():
157
  input_image = gr.Image(label = "Input Image", type="filepath")
158
+ question = gr.Textbox(label = "Question")
159
  run_button = gr.Button("Answer")
160
  with gr.Column():
161
  answer_kosmos = gr.Textbox(label="Answer generated by KOSMOS-2")
162
  answer_fuyu = gr.Textbox(label="Answer generated by Fuyu-8B")
163
+ answer_llava = gr.Textbox(label="Answer generated by LLaVA-NeXT")
164
  outputs_answer = [
165
+ answer_kosmos, answer_fuyu, answer_llava
166
  ]
167
 
168
  gr.Examples(
 
188
  caption_kosmos = gr.Textbox(label="Caption generated by KOSMOS-2")
189
  caption_fuyu = gr.Textbox(label="Caption generated by Fuyu-8B")
190
 
191
+ outputs_caption = [caption_kosmos, caption_fuyu, caption_llava]
192
 
193
  gr.Examples(
194
  examples = [["./cat.png", True], ["./cat.png", False]],