Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -2,6 +2,19 @@ import gradio as gr
|
|
2 |
from gradio_client import Client
|
3 |
import os
|
4 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
def postprocess_kosmos_out(result):
|
7 |
token = ""
|
@@ -84,16 +97,42 @@ def generate_caption(image_path, caption_bool):
|
|
84 |
|
85 |
kosmos_caption = generate_caption_kosmos(image_path, caption_bool)
|
86 |
fuyu_caption = generate_caption_fuyu(image_path, caption_bool)
|
|
|
87 |
|
88 |
-
return kosmos_caption, fuyu_caption
|
89 |
|
90 |
|
91 |
def generate_answers(image_path, question):
    """Put the same question about one image to both models.

    Returns a tuple holding the result of generate_answer_kosmos followed
    by the result of generate_answer_fuyu.
    """
    from_kosmos = generate_answer_kosmos(image_path, question)
    from_fuyu = generate_answer_fuyu(image_path, question)

    return (from_kosmos, from_fuyu)
|
97 |
|
98 |
title = "# Comparing Vision Language Models"
|
99 |
|
@@ -116,14 +155,14 @@ with gr.Blocks(css=css) as demo:
|
|
116 |
with gr.Tab("Visual Question Answering"):
|
117 |
with gr.Column():
|
118 |
input_image = gr.Image(label = "Input Image", type="filepath")
|
119 |
-
question = gr.Textbox(label = "
|
120 |
run_button = gr.Button("Answer")
|
121 |
with gr.Column():
|
122 |
answer_kosmos = gr.Textbox(label="Answer generated by KOSMOS-2")
|
123 |
answer_fuyu = gr.Textbox(label="Answer generated by Fuyu-8B")
|
124 |
-
|
125 |
outputs_answer = [
|
126 |
-
answer_kosmos, answer_fuyu
|
127 |
]
|
128 |
|
129 |
gr.Examples(
|
@@ -149,7 +188,7 @@ with gr.Blocks(css=css) as demo:
|
|
149 |
caption_kosmos = gr.Textbox(label="Caption generated by KOSMOS-2")
|
150 |
caption_fuyu = gr.Textbox(label="Caption generated by Fuyu-8B")
|
151 |
|
152 |
-
outputs_caption = [caption_kosmos, caption_fuyu]
|
153 |
|
154 |
gr.Examples(
|
155 |
examples = [["./cat.png", True], ["./cat.png", False]],
|
|
|
2 |
from gradio_client import Client
|
3 |
import os
|
4 |
import json
|
5 |
+
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
|
6 |
+
import torch
|
7 |
+
from PIL import Image
|
8 |
+
import requests
|
9 |
+
import spaces
|
10 |
+
|
11 |
+
# Checkpoint shared by the processor and the model so the two cannot drift apart.
LLAVA_MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

processor = LlavaNextProcessor.from_pretrained(LLAVA_MODEL_ID)

model = LlavaNextForConditionalGeneration.from_pretrained(LLAVA_MODEL_ID, torch_dtype=torch.float16, low_cpu_mem_usage=True)
# Pass the device object itself: the literal string "device" is not a valid
# device name and makes .to() raise at start-up.
model.to(device)
|
17 |
+
|
18 |
|
19 |
def postprocess_kosmos_out(result):
|
20 |
token = ""
|
|
|
97 |
|
98 |
kosmos_caption = generate_caption_kosmos(image_path, caption_bool)
|
99 |
fuyu_caption = generate_caption_fuyu(image_path, caption_bool)
|
100 |
+
llava_caption = generate_caption_llava(image_path, caption_bool)
|
101 |
|
102 |
+
return kosmos_caption, fuyu_caption, llava_caption
|
103 |
|
104 |
|
105 |
def generate_answers(image_path, question):
    """Put the same question about one image to all three models.

    Returns a tuple holding, in order, the results of
    generate_answer_kosmos, generate_answer_fuyu and generate_answer_llava.
    """
    from_kosmos = generate_answer_kosmos(image_path, question)
    from_fuyu = generate_answer_fuyu(image_path, question)
    from_llava = generate_answer_llava(image_path, question)

    return (from_kosmos, from_fuyu, from_llava)
|
112 |
+
|
113 |
+
|
114 |
+
@spaces.GPU
def generate_caption_llava(image_path, caption_bool):
    """Caption an image with the LLaVA-NeXT model.

    Args:
        image_path: Path of the image file to caption.
        caption_bool: When truthy, ask for a detailed caption; otherwise
            ask for a brief one.

    Returns:
        The text generated by the model, without the prompt.
    """
    # The documented prompt format for this checkpoint carries an explicit
    # <image> placeholder marking where the image features are inserted.
    if caption_bool:
        text_prompt = "[INST] <image>\nCaption this image in detail in objective manner.[/INST]"
    else:
        text_prompt = "[INST] <image>\nCaption this image briefly in objective manner. [/INST]"

    # Keyword arguments keep the call independent of the processor's
    # positional order; the context manager closes the image file.
    with Image.open(image_path) as image:
        inputs = processor(text=text_prompt, images=image, return_tensors="pt").to(device)

    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=100)

    # generate() returns the prompt tokens followed by the new tokens, and
    # decode() returns a plain string, so drop the prompt by token count
    # rather than indexing the decoded text.
    prompt_length = inputs["input_ids"].shape[-1]
    return processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
|
127 |
+
|
128 |
+
@spaces.GPU
def generate_answer_llava(image_path, question):
    """Answer a question about an image with the LLaVA-NeXT model.

    Args:
        image_path: Path of the image file the question refers to.
        question: The question text to put to the model.

    Returns:
        The text generated by the model, without the prompt.
    """
    # The documented prompt format for this checkpoint carries an explicit
    # <image> placeholder marking where the image features are inserted.
    text_prompt = f"[INST] <image>\n{question} [/INST]"

    # Keyword arguments keep the call independent of the processor's
    # positional order; the context manager closes the image file.
    with Image.open(image_path) as image:
        inputs = processor(text=text_prompt, images=image, return_tensors="pt").to(device)

    output = model.generate(**inputs, max_new_tokens=100)

    # generate() returns the prompt tokens followed by the new tokens, and
    # decode() returns a plain string, so drop the prompt by token count
    # rather than indexing the decoded text.
    prompt_length = inputs["input_ids"].shape[-1]
    return processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
|
134 |
+
|
135 |
|
|
|
136 |
|
137 |
title = "# Comparing Vision Language Models"
|
138 |
|
|
|
155 |
with gr.Tab("Visual Question Answering"):
|
156 |
with gr.Column():
|
157 |
input_image = gr.Image(label = "Input Image", type="filepath")
|
158 |
+
question = gr.Textbox(label = "Question")
|
159 |
run_button = gr.Button("Answer")
|
160 |
with gr.Column():
|
161 |
answer_kosmos = gr.Textbox(label="Answer generated by KOSMOS-2")
|
162 |
answer_fuyu = gr.Textbox(label="Answer generated by Fuyu-8B")
|
163 |
+
answer_llava = gr.Textbox(label="Answer generated by LLaVA-NeXT")
|
164 |
outputs_answer = [
|
165 |
+
answer_kosmos, answer_fuyu, answer_llava
|
166 |
]
|
167 |
|
168 |
gr.Examples(
|
|
|
188 |
caption_kosmos = gr.Textbox(label="Caption generated by KOSMOS-2")
|
189 |
caption_fuyu = gr.Textbox(label="Caption generated by Fuyu-8B")
|
190 |
|
191 |
+
outputs_caption = [caption_kosmos, caption_fuyu, caption_llava]
|
192 |
|
193 |
gr.Examples(
|
194 |
examples = [["./cat.png", True], ["./cat.png", False]],
|