"""Gradio chat demo that streams LLaVA (Llama-3 8B) answers about an image."""

import time
from threading import Thread

import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import TextIteratorStreamer

import spaces

model_id = "xtuner/llava-llama-3-8b-v1_1-transformers"

# Marker after which streamed text is discarded.
_EOT_TOKEN = "<|eot_id|>"
# Path used when neither the current message nor the history carries an image.
_FALLBACK_IMAGE = "ignore.png"

processor = AutoProcessor.from_pretrained(model_id)

model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.to("cuda:0")
# NOTE(review): 128009 is presumably the token id of "<|eot_id|>" for this
# tokenizer, so generation stops at the end of the assistant turn -- confirm.
model.generation_config.eos_token_id = 128009


def _select_image(message, history):
    """Return the image path to use for this turn.

    Preference order: the last file attached to ``message``, then the first
    history entry whose first element is a tuple (its item 0 is used), then
    the fallback path ``"ignore.png"``.
    """
    files = message["files"]
    if files:
        latest = files[-1]
        # A file may arrive as a dict carrying a "path" key or as the path itself.
        return latest["path"] if isinstance(latest, dict) else latest

    for turn in history:
        if isinstance(turn[0], tuple):
            return turn[0][0]

    return _FALLBACK_IMAGE


@spaces.GPU
def bot_streaming(message, history):
    """Stream the model's reply to a multimodal chat message.

    Args:
        message: Mapping with a ``"text"`` entry (the user's prompt) and a
            ``"files"`` list; each file is a path or a dict with a
            ``"path"`` key.
        history: Iterable of earlier turns; a turn whose first element is a
            tuple is treated as an image upload with the path at index 0.

    Yields:
        The reply accumulated so far. Each streamed chunk is truncated at
        ``<|eot_id|>`` before being appended.
    """
    print(message)

    image_path = _select_image(message, history)

    # The "<image>" placeholder tells the LLaVA processor where the image
    # features belong; without it the image cannot be aligned with the text.
    prompt = (
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"<image>\n{message['text']}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    # Preprocess inside the context manager so the file handle is released
    # as soon as the tensors have been built.
    with Image.open(image_path) as image:
        # Keyword arguments keep this independent of the processor's
        # positional parameter order.
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(
            0, torch.float16
        )

    streamer = TextIteratorStreamer(
        processor,
        skip_special_tokens=False,
        skip_prompt=True,
    )
    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=False,
    )

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator drains the streamer.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    time.sleep(0.5)
    for new_text in streamer:
        # Keep only the text before the end-of-turn marker, if it is present.
        buffer += new_text.partition(_EOT_TOKEN)[0]
        time.sleep(0.06)
        yield buffer


chatbot = gr.Chatbot(scale=1)
chat_input = gr.MultimodalTextbox(
    interactive=True,
    file_types=["image"],
    placeholder="Enter message or upload file...",
    show_label=False,
)

with gr.Blocks(fill_height=True) as demo:
    gr.ChatInterface(
        fn=bot_streaming,
        stop_btn="Stop Generation",
        multimodal=True,
        textbox=chat_input,
        chatbot=chatbot,
    )

demo.queue(api_open=False)
demo.launch(show_api=False, share=False)