"""Gradio demo comparing three vision-language models on captioning and VQA.

KOSMOS-2 and Fuyu-8B are queried remotely through their public Hugging Face
Spaces via ``gradio_client``; LLaVA-Next (llava-v1.6-mistral-7b) runs locally
on GPU through ``transformers``.
"""
import json
import os

import gradio as gr
import requests
import spaces
import torch
from gradio_client import Client
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration

# One-time load of the local LLaVA-Next model (fp16 to fit on a single GPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
model = LlavaNextForConditionalGeneration.from_pretrained(
    "llava-hf/llava-v1.6-mistral-7b-hf",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
# BUG FIX: the original called model.to("device") with the literal string
# "device", which raises at runtime; move the model to the actual device.
model.to(device)


def postprocess_kosmos_out(result):
    """Join the per-token strings of a KOSMOS-2 Space response into one string.

    ``result[1]`` is a list of dicts each carrying a ``"token"`` key.
    """
    return "".join(res["token"] for res in result[1])


def generate_caption_fuyu(image_path, caption_bool):
    """Caption an image via the public Fuyu-8B demo Space.

    Returns the Space's caption string, or "" (after surfacing a UI warning)
    when the Space is unreachable.
    """
    try:
        client = Client("adept/fuyu-8b-demo")
        # fn_index=2 is the captioning endpoint of the Fuyu-8B Space.
        return client.predict(image_path, caption_bool, fn_index=2)
    except Exception as e:
        print(e)
        gr.Warning("The Fuyu-8B Space is currently unavailable. Please try again later.")
        return ""


def generate_answer_fuyu(image_path, question):
    """Answer a question about an image via the public Fuyu-8B demo Space."""
    try:
        client = Client("adept/fuyu-8b-demo")
        # fn_index=3 is the VQA endpoint of the Fuyu-8B Space.
        result = client.predict(image_path, question, fn_index=3)
        print(result)
        return result
    except Exception as e:
        print(e)
        gr.Warning("The Fuyu-8B Space is currently unavailable. Please try again later.")
        return ""


def generate_caption_kosmos(image_path, caption_bool):
    """Caption an image via the public KOSMOS-2 Space.

    ``caption_bool`` selects "Detailed" vs "Brief" captioning on the remote
    endpoint. Returns "" (after a UI warning) when the Space is unreachable.
    """
    client = Client("merve/kosmos2")
    try:
        caption = "Detailed" if caption_bool else "Brief"
        result = client.predict(
            image_path, caption, None, api_name="/generate_predictions"
        )
        return postprocess_kosmos_out(result)
    except Exception as e:
        print(e)
        gr.Warning("The KOSMOS-2 Space is currently unavailable. Please try again later.")
        return ""


def generate_answer_kosmos(image_path, question):
    """Answer a question about an image via the public KOSMOS-2 Space."""
    try:
        client = Client("merve/kosmos2")
        # fn_index=3 is the VQA endpoint; the middle positional arg is unused.
        result = client.predict(image_path, None, question, fn_index=3)
        return postprocess_kosmos_out(result)
    except Exception as e:
        print(e)
        gr.Warning("The KOSMOS-2 Space is currently unavailable. Please try again later.")
        return ""


def generate_caption(image_path, caption_bool):
    """Fan a captioning request out to all three models.

    Returns a (kosmos, fuyu, llava) tuple of caption strings.
    """
    kosmos_caption = generate_caption_kosmos(image_path, caption_bool)
    fuyu_caption = generate_caption_fuyu(image_path, caption_bool)
    llava_caption = generate_caption_llava(image_path, caption_bool)
    return kosmos_caption, fuyu_caption, llava_caption


def generate_answers(image_path, question):
    """Fan a VQA request out to all three models.

    Returns a (kosmos, fuyu, llava) tuple of answer strings.
    """
    kosmos_answer = generate_answer_kosmos(image_path, question)
    fuyu_answer = generate_answer_fuyu(image_path, question)
    llava_answer = generate_answer_llava(image_path, question)
    return kosmos_answer, fuyu_answer, llava_answer


def _llava_generate(image_path, text_prompt):
    """Run the local LLaVA-Next model on (image, prompt) and return the reply.

    Fixes vs. original: uses ``text_prompt`` (the original referenced an
    undefined name ``prompt``), and treats ``processor.decode`` output as the
    plain string it is (the original indexed it with ``["generated_text"]``,
    a TypeError). The echoed prompt is stripped by splitting on the closing
    ``[/INST]`` tag.
    """
    inputs = processor(text_prompt, Image.open(image_path), return_tensors="pt").to(device)
    # Autoregressively complete the prompt.
    output = model.generate(**inputs, max_new_tokens=100)
    decoded = processor.decode(output[0], skip_special_tokens=True)
    return decoded.split("[/INST]")[-1].strip()


@spaces.GPU
def generate_caption_llava(image_path, caption_bool):
    """Caption an image with the local LLaVA-Next model (detailed or brief)."""
    # NOTE(review): the llava-v1.6-mistral template requires an <image> token
    # in the prompt; it appears to have been stripped from the original file
    # (which was HTML-mangled) — confirm against the model card.
    if caption_bool:
        text_prompt = "[INST] <image>\nCaption this image in detail in objective manner.[/INST]"
    else:
        text_prompt = "[INST] <image>\nCaption this image briefly in objective manner. [/INST]"
    return _llava_generate(image_path, text_prompt)


@spaces.GPU
def generate_answer_llava(image_path, question):
    """Answer a free-form question about an image with the local LLaVA-Next model."""
    text_prompt = f"[INST] <image>\n{question} [/INST]"
    return _llava_generate(image_path, text_prompt)


title = "# Comparing Vision Language Models"

css = """
  #mkd {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
  }
"""

with gr.Blocks(css=css) as demo:
    # NOTE(review): the original Blocks UI was truncated in this chunk of the
    # file (it ended mid-call at `gr.HTML("`). The single element below is a
    # placeholder so the module parses — restore the full layout (inputs,
    # per-model output boxes, caption/VQA buttons wired to generate_caption
    # and generate_answers) from the complete file.
    gr.HTML("<h1>Comparing Vision Language Models</h1>")