vlm-playground / app.py
edbeeching's picture
edbeeching HF staff
Update app.py
17ec0aa verified
import gradio as gr
from transformers import LlavaProcessor, LlavaForConditionalGeneration, TextIteratorStreamer
from threading import Thread
import re
import time
from PIL import Image
import torch
import spaces
import os
from huggingface_hub import login
# Authenticate against the Hugging Face Hub with the token from the environment.
login(token=os.environ["HF_TOKEN"])

# The checkpoint and its revision are both taken from environment variables.
MODEL_ID = os.environ["MODEL_ID"]
REVISION = os.environ["MODEL_REVISION"]

# Load the processor and the half-precision model once at import time,
# then move the model onto the first CUDA device.
processor = LlavaProcessor.from_pretrained(MODEL_ID, revision=REVISION)
model = LlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    revision=REVISION,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.to("cuda:0")
def _latest_image_path(message, history):
    """Return the path of the most recent image in the conversation, or None.

    Args:
        message: The current multimodal message; a mapping with a "files"
            entry (list of uploaded files) and a "text" entry.
        history: Previous turns; each turn is indexable and its first item
            is a tuple whose first element is a file path when that turn
            carried an upload.

    Returns:
        The path from the current message's last file if there is one,
        otherwise the path from the last history turn that held a file
        tuple, otherwise None.
    """
    files = message["files"]
    if files:
        newest = files[-1]
        # File entries are accepted both as {"path": ...} mappings and as
        # plain path strings, so either payload shape works.
        return newest["path"] if isinstance(newest, dict) else newest

    # No upload in this turn: scan the past turns and keep the last image
    # found (uploads are stored inside tuples).
    image_path = None
    for turn in history:
        if isinstance(turn[0], tuple):
            image_path = turn[0][0]
    return image_path


@spaces.GPU
def bot_streaming(message, history):
    """Stream the model's answer about the latest image in the chat.

    Args:
        message: Mapping with "text" (the user's question) and "files"
            (uploads attached to this turn).
        history: Earlier chat turns, used to find an image when the
            current turn has none.

    Yields:
        The text accumulated so far with the leading prompt-length prefix
        sliced off, once per chunk produced by the streamer.

    Raises:
        gr.Error: If neither the current message nor the history contains
            an image.
    """
    print(message)

    image_path = _latest_image_path(message, history)
    if image_path is None:
        # gr.Error is an exception: it has to be raised to reach the user.
        raise gr.Error("You need to upload an image for LLaVA to work.")

    prompt = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{message['text']}\nASSISTANT:"
    image = Image.open(image_path).convert("RGB")
    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")

    streamer = TextIteratorStreamer(processor, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=512)

    # Generation runs in a background thread so this generator can consume
    # the streamer while tokens are still being produced.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # The streamer is created without skip_prompt, so the prompt is part of
    # the streamed text. This image-token-free copy of the prompt is used
    # only for its length, to slice that prefix off what is shown.
    text_prompt = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \n{message['text']}\nASSISTANT: "

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        # Short pause between updates to pace the streamed output.
        time.sleep(0.04)
        yield buffer[len(text_prompt):]
# Multimodal chat UI wired to the streaming handler, with three example
# prompts that each attach a bundled image.
# For internal VLMs: change the model ID and revision under the environments
# of the Space settings.
demo = gr.ChatInterface(
    fn=bot_streaming,
    title="VLM Playground",
    examples=[
        {"text": "What is on the flower?", "files": ["./bee.jpg"]},
        {"text": "How to make this pastry?", "files": ["./baklava.png"]},
        {"text": "What is this?", "files": ["./pizza2.jpeg"]},
    ],
    description="VLM Playground host HuggingFaceH4/vsft-llava-1.5-7b-hf-trl a llava SFT finetune using TRL's SFTTrainer",
    stop_btn="Stop Generation",
    multimodal=True,
)

# Start the app with debug output enabled.
demo.launch(debug=True)