File size: 2,608 Bytes
f04732f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69f9849
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f04732f
69f9849
 
 
 
 
 
 
 
f04732f
 
3edc738
f04732f
3edc738
f04732f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import gradio as gr
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration, TextIteratorStreamer
from threading import Thread
import re
import time 
from PIL import Image
import torch
import spaces

# The processor and the weights are loaded from the same Hub checkpoint.
_CHECKPOINT = "llava-hf/llava-v1.6-mistral-7b-hf"

processor = LlavaNextProcessor.from_pretrained(_CHECKPOINT)

# Load the weights as float16 and place the model on the first CUDA device.
model = LlavaNextForConditionalGeneration.from_pretrained(
    _CHECKPOINT,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to("cuda:0")

def _latest_image_path(message, history):
    """Return the path of the most recent image in the conversation, or None.

    An image attached to the current ``message`` takes precedence. Otherwise
    the history is searched from the newest turn to the oldest for a turn
    whose user entry is a tuple, and that tuple's first element is returned.
    """
    files = message["files"]
    if files:
        newest = files[-1]
        # Uploads are indexed as dicts with a "path" key; also accept a bare
        # path string, which is the form used by the interface's examples.
        return newest["path"] if isinstance(newest, dict) else newest
    for turn in reversed(history):
        if isinstance(turn[0], tuple):
            return turn[0][0]
    return None


@spaces.GPU
def bot_streaming(message, history):
    """Stream the model's answer about the most recent image in the chat.

    Args:
        message: Multimodal chat message, a dict with a ``"text"`` entry and
            a ``"files"`` list.
        history: Earlier turns as ``(user, assistant)`` pairs; a user entry
            that is a tuple carries an image path as its first element.

    Yields:
        The answer generated so far, growing with every streamed chunk.

    Raises:
        gr.Error: If neither the message nor the history contains an image.
    """
    image_path = _latest_image_path(message, history)
    if image_path is None:
        # The error must be raised (not merely constructed) to reach the UI
        # and to stop before Image.open is called without a path.
        raise gr.Error("You need to upload an image for LLaVA to work.")

    # Only the current turn's text is sent to the model; earlier turns are
    # consulted solely to recover the image.
    prompt = f"[INST] <image>\n{message['text']} [/INST]"
    image = Image.open(image_path).convert("RGB")
    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")

    # skip_prompt drops the echoed prompt inside the streamer, so the output
    # does not depend on guessing the decoded prompt's character length.
    streamer = TextIteratorStreamer(
        processor, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=100)

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator consumes the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        # Short pause between updates to pace the streamed output.
        time.sleep(0.04)
        yield buffer


# Sample prompts shown under the chat box; each pairs a question with an image file.
_EXAMPLES = [
    {"text": "What is on the flower?", "files": ["./bee.jpg"]},
    {"text": "How to make this pastry?", "files": ["./baklava.png"]},
]

# Markdown blurb rendered below the title (adjacent literals are concatenated).
_DESCRIPTION = (
    "Try [LLaVA NeXT](https://huggingface.co/docs/transformers/main/en/model_doc/llava_next) "
    "in this demo (more specifically, the "
    "[Mistral-7B variant](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)). "
    "Upload an image and start chatting about it, or simply try one of the examples below. "
    "If you don't upload an image, you will receive an error."
)

# Multimodal chat UI backed by the streaming handler.
demo = gr.ChatInterface(
    fn=bot_streaming,
    title="LLaVA NeXT",
    examples=_EXAMPLES,
    description=_DESCRIPTION,
    stop_btn="Stop Generation",
    multimodal=True,
)
demo.launch(debug=True)