# NOTE: this file was captured from a Hugging Face Spaces page view.
# Space status at capture time: "Runtime error". File size: 6,814 bytes.
# The page's blame column (commits c932ee3, a65d678, bf0383a) and its
# line-number gutter (1-209) were page furniture, not code, and are summarized here.
import gradio as gr
from gradio_client import Client
import os
import json
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import torch
from PIL import Image
import requests
import spaces
# Hub checkpoint shared by the processor and the model so the two cannot drift apart.
LLAVA_MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = LlavaNextProcessor.from_pretrained(LLAVA_MODEL_ID)
model = LlavaNextForConditionalGeneration.from_pretrained(
    LLAVA_MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
# Pass the torch.device object itself: the literal string "device" is not a
# valid device specification and makes Module.to() raise at import time.
model.to(device)
def postprocess_kosmos_out(result):
    """Concatenate the token strings of a KOSMOS-2 Space prediction.

    Args:
        result: Indexable prediction payload whose element at index 1 is an
            iterable of mappings, each carrying a ``"token"`` string.

    Returns:
        All ``"token"`` values joined in order, or ``""`` when there are none.
    """
    return "".join(entry["token"] for entry in result[1])
def generate_caption_fuyu(image_path, caption_bool):
    """Request an image caption from the remote Fuyu-8B demo Space.

    Args:
        image_path: Path of the image file, forwarded as the first endpoint argument.
        caption_bool: Forwarded unchanged as the second endpoint argument
            (the UI passes the "Detailed Captioning" checkbox value).

    Returns:
        Whatever the remote endpoint returns, or ``""`` when the call fails.
    """
    try:
        # The client is created inside the try so that a failure to reach the
        # Space is reported through the same warning path as a failed prediction.
        client = Client("adept/fuyu-8b-demo")
        return client.predict(image_path, caption_bool, fn_index=2)
    except Exception as e:  # best-effort boundary: the UI must keep working
        print(e)
        gr.Warning("The Fuyu-8B Space is currently unavailable. Please try again later.")
        return ""
def generate_answer_fuyu(image_path, question):
    """Ask the remote Fuyu-8B demo Space a question about an image.

    Args:
        image_path: Path of the image file, forwarded as the first endpoint argument.
        question: Question text, forwarded as the second endpoint argument.

    Returns:
        Whatever the remote endpoint returns, or ``""`` when the call fails.
    """
    try:
        # The client is created inside the try so that a failure to reach the
        # Space is reported through the same warning path as a failed prediction.
        client = Client("adept/fuyu-8b-demo")
        result = client.predict(image_path, question, fn_index=3)
        # Echo the raw answer to the process log.
        print(result)
        return result
    except Exception as e:  # best-effort boundary: the UI must keep working
        print(e)
        gr.Warning("The Fuyu-8B Space is currently unavailable. Please try again later.")
        return ""
def generate_caption_kosmos(image_path, caption_bool):
    """Request an image caption from the remote KOSMOS-2 Space.

    Args:
        image_path: Path of the image file, forwarded as the first endpoint argument.
        caption_bool: Truthy selects the "Detailed" caption mode, falsy "Brief".

    Returns:
        The concatenated tokens of the prediction, or ``""`` when the call fails.
    """
    try:
        # Creating the client belongs inside the try: otherwise an error raised
        # while connecting would skip the warning and the "" fallback below.
        client = Client("merve/kosmos2")
        caption = "Detailed" if caption_bool else "Brief"
        result = client.predict(
            image_path,
            caption,
            None,
            api_name="/generate_predictions",
        )
        return postprocess_kosmos_out(result)
    except Exception as e:  # best-effort boundary: the UI must keep working
        print(e)
        gr.Warning("The KOSMOS-2 Space is currently unavailable. Please try again later.")
        return ""
def generate_answer_kosmos(image_path, question):
    """Ask the remote KOSMOS-2 Space a question about an image.

    Args:
        image_path: Path of the image file, forwarded as the first endpoint argument.
        question: Question text, forwarded as the third endpoint argument
            (the second argument is sent as ``None``).

    Returns:
        The concatenated tokens of the prediction, or ``""`` when the call fails.
    """
    try:
        # The client is created inside the try so that a failure to reach the
        # Space is reported through the same warning path as a failed prediction.
        client = Client("merve/kosmos2")
        result = client.predict(image_path, None, question, fn_index=3)
        return postprocess_kosmos_out(result)
    except Exception as e:  # best-effort boundary: the UI must keep working
        print(e)
        gr.Warning("The KOSMOS-2 Space is currently unavailable. Please try again later.")
        return ""
def generate_caption(image_path, caption_bool):
    """Caption one image with every model, in KOSMOS-2, Fuyu-8B, LLaVA order.

    Args:
        image_path: Path of the image file handed to each captioner.
        caption_bool: Detail flag handed to each captioner unchanged.

    Returns:
        A 3-tuple ``(kosmos_caption, fuyu_caption, llava_caption)``.
    """
    # Tuple elements are evaluated left to right, so the call order is fixed.
    return (
        generate_caption_kosmos(image_path, caption_bool),
        generate_caption_fuyu(image_path, caption_bool),
        generate_caption_llava(image_path, caption_bool),
    )
def generate_answers(image_path, question):
    """Answer one question with every model, in KOSMOS-2, Fuyu-8B, LLaVA order.

    Args:
        image_path: Path of the image file handed to each model.
        question: Question text handed to each model unchanged.

    Returns:
        A 3-tuple ``(kosmos_answer, fuyu_answer, llava_answer)``.
    """
    # Tuple elements are evaluated left to right, so the call order is fixed.
    return (
        generate_answer_kosmos(image_path, question),
        generate_answer_fuyu(image_path, question),
        generate_answer_llava(image_path, question),
    )
@spaces.GPU
def generate_caption_llava(image_path, caption_bool):
    """Caption an image with the locally loaded LLaVA-NeXT model.

    Args:
        image_path: Path of the image file to caption.
        caption_bool: Truthy asks for a detailed caption, falsy for a brief one.

    Returns:
        The decoded text of the newly generated tokens only (at most 100
        tokens), with surrounding whitespace stripped.
    """
    # The "<image>" placeholder marks where the processor/model splice in the
    # image features; without it the prompt carries no image slot.
    if caption_bool:
        text_prompt = "[INST] <image>\nCaption this image in detail in objective manner.[/INST]"
    else:
        text_prompt = "[INST] <image>\nCaption this image briefly in objective manner. [/INST]"

    # Keyword arguments keep the call independent of the processor's positional
    # order; the context manager closes the image file once it has been read.
    with Image.open(image_path) as image:
        inputs = processor(text=text_prompt, images=image, return_tensors="pt").to(device)

    # autoregressively complete prompt
    output = model.generate(**inputs, max_new_tokens=100)

    # generate() returns prompt tokens followed by the new ones; drop the prompt
    # by token count (decode() returns a plain string, so it cannot be indexed
    # by key, and character-length slicing is unreliable once special tokens
    # are skipped).
    prompt_length = inputs["input_ids"].shape[-1]
    return processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
@spaces.GPU
def generate_answer_llava(image_path, question):
    """Answer a question about an image with the locally loaded LLaVA-NeXT model.

    Args:
        image_path: Path of the image file the question refers to.
        question: Question text inserted into the instruction prompt.

    Returns:
        The decoded text of the newly generated tokens only (at most 100
        tokens), with surrounding whitespace stripped.
    """
    # The "<image>" placeholder marks where the processor/model splice in the
    # image features; without it the prompt carries no image slot.
    text_prompt = f"[INST] <image>\n{question} [/INST]"

    # Keyword arguments keep the call independent of the processor's positional
    # order; the context manager closes the image file once it has been read.
    with Image.open(image_path) as image:
        inputs = processor(text=text_prompt, images=image, return_tensors="pt").to(device)

    output = model.generate(**inputs, max_new_tokens=100)

    # generate() returns prompt tokens followed by the new ones; drop the prompt
    # by token count (decode() returns a plain string, so it cannot be indexed
    # by key, and character-length slicing is unreliable once special tokens
    # are skipped).
    prompt_length = inputs["input_ids"].shape[-1]
    return processor.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
# Markdown-formatted page title.
# NOTE(review): not referenced by any code visible in this file.
title = "# Comparing Vision Language Models"
# Custom stylesheet handed to gr.Blocks(css=...): a fixed-height, scrollable,
# bordered box for the element with id "mkd".
# NOTE(review): no visible component sets that id, so confirm the rule is still needed.
css = """
#mkd {
height: 500px;
overflow: auto;
border: 1px solid #ccc;
}
"""
# UI: an intro section followed by two tabs (visual question answering and
# image captioning), each showing the three models' outputs.
with gr.Blocks(css=css) as demo:
    # Closing tags fixed: the header previously re-opened <center>/<h1> instead of closing them.
    gr.HTML("<h1><center>Compare Vision Language Models 🖼️ 💬 </center></h1>")
    gr.Markdown("Vision Language Models are essentially language models with a capability of understanding images.")
    gr.Markdown("To try this Space, simply try either captioning or visual question answering. ")
    gr.Markdown("If prompted to wait and try again, please try again. This Space uses other Spaces as APIs, so it might take time to get those Spaces up and running if they're stopped.")
    gr.Markdown("Lastly, Fuyu-8B and KOSMOS-2 has the capability of locating images in object detection-like manner. Feel free to try them in their own Spaces.")

    with gr.Row():
        with gr.Tab("Visual Question Answering"):
            with gr.Column():
                input_image = gr.Image(label="Input Image", type="filepath")
                question = gr.Textbox(label="Question")
                run_button = gr.Button("Answer")

            with gr.Column():
                answer_kosmos = gr.Textbox(label="Answer generated by KOSMOS-2")
                answer_fuyu = gr.Textbox(label="Answer generated by Fuyu-8B")
                answer_llava = gr.Textbox(label="Answer generated by LLaVA-NeXT")

            # Order must match the tuple returned by generate_answers.
            outputs_answer = [answer_kosmos, answer_fuyu, answer_llava]

            gr.Examples(
                examples=[["./cat.png", "What is behind the cat?"]],
                inputs=[input_image, question],
                outputs=outputs_answer,
                fn=generate_answers,
                cache_examples=True,
            )

            run_button.click(
                fn=generate_answers,
                inputs=[input_image, question],
                outputs=outputs_answer,
            )

        with gr.Tab("Image Captioning"):
            # input_image / run_button are rebound here; the handlers of the
            # previous tab were already wired to the earlier components.
            with gr.Column():
                input_image = gr.Image(label="Input Image", type="filepath")
                detailed_caption = gr.Checkbox(label="Detailed Captioning")
                run_button = gr.Button("Caption")

            with gr.Column():
                caption_kosmos = gr.Textbox(label="Caption generated by KOSMOS-2")
                caption_fuyu = gr.Textbox(label="Caption generated by Fuyu-8B")
                # This textbox was missing although it is listed in
                # outputs_caption below, which raised NameError while building the UI.
                caption_llava = gr.Textbox(label="Caption generated by LLaVA-NeXT")

            # Order must match the tuple returned by generate_caption.
            outputs_caption = [caption_kosmos, caption_fuyu, caption_llava]

            gr.Examples(
                examples=[["./cat.png", True], ["./cat.png", False]],
                inputs=[input_image, detailed_caption],
                outputs=outputs_caption,
                fn=generate_caption,
                cache_examples=True,
            )

            run_button.click(
                fn=generate_caption,
                inputs=[input_image, detailed_caption],
                outputs=outputs_caption,
            )
# Start the app only when the file is run as a script, not when it is imported.
if __name__ == "__main__":
    # The stray trailing "|" (page-capture residue) was a syntax error and is removed.
    demo.queue().launch(debug=True)