LOpeetu committed
Commit b08e04f
1 Parent(s): 51d098c

Upload 3 files

Files changed (3)
  1. app.py +50 -0
  2. qwenvl.py +88 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,50 @@
+ from qwenvl import QwenVLModel
+ import gradio as gr
+ from PIL import Image
+ from datetime import datetime
+ import os
+ import numpy as np
+
+ model = QwenVLModel()
+ DESCRIPTION = "[Qwen2-VL-7B Demo](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)"
+
+ def array_to_image_path(image_array):
+     # Convert numpy array to PIL Image
+     img = Image.fromarray(np.uint8(image_array))
+
+     # Generate a unique filename using timestamp
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     filename = f"image_{timestamp}.png"
+
+     # Save the image
+     img.save(filename)
+
+     # Get the full path of the saved image
+     full_path = os.path.abspath(filename)
+
+     return full_path
+
+ css = """
+ #output {
+     height: 500px;
+     overflow: auto;
+     border: 1px solid #ccc;
+ }
+ """
+
+ with gr.Blocks(css=css) as demo:
+     gr.Markdown(DESCRIPTION)
+     with gr.Tab(label="Qwen2-VL-7B Input"):
+         with gr.Row():
+             with gr.Column():
+                 # type="pil" hands the handler a PIL.Image directly, so no array conversion is needed here
+                 input_img = gr.Image(label="Input Picture", type="pil")
+                 text_input = gr.Textbox(label="Question")
+                 submit_btn = gr.Button(value="Submit")
+             with gr.Column():
+                 output_text = gr.Textbox(label="Output Text")
+
+     submit_btn.click(model.oneImagecall, [input_img, text_input], [output_text])
+
+ demo.queue(api_open=False)
+ demo.launch(debug=True)
qwenvl.py ADDED
@@ -0,0 +1,88 @@
+ from PIL import Image
+ import requests
+ import torch
+ from torchvision import io
+ from typing import Dict
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+ import spaces
+
+ class inputParent():
+     def __init__(self, raw_data, source_path=None):
+         self.sourcePath = source_path
+         self.rawData = raw_data
+
+     def __call__(self):
+         return self.rawData
+
+ class imageInput(inputParent):
+     def __init__(self, raw_data, source_path=None):
+         super().__init__(raw_data, source_path)
+
+ class videoInput(inputParent):
+     def __init__(self, raw_data, source_path=None):
+         super().__init__(raw_data, source_path)
+
+ class textInput(inputParent):
+     def __init__(self, raw_data, source_path=None):
+         super().__init__(raw_data, source_path)
+
+
+ class QwenVLModel():
+     def __init__(self,
+                  model='Qwen/Qwen2-VL-7B-Instruct',
+                  device_map='auto'):
+         self.modelName = model
+         self.deviceMap = device_map
+
+         self.model = Qwen2VLForConditionalGeneration.from_pretrained(self.modelName, torch_dtype="auto", device_map=self.deviceMap)
+         self.processor = AutoProcessor.from_pretrained(self.modelName)
+         self.conversation = []
+         self.verbose = True
+
+     def addToConversation(self, inputs, role='user'):
+         self.conversation.append(
+             {
+                 'role': role,
+                 'content': []
+             }
+         )
+
+         for _input in inputs:
+             if isinstance(_input, imageInput):
+                 self.conversation[-1]['content'].append({'type': 'image'})
+
+             if isinstance(_input, videoInput):
+                 self.conversation[-1]['content'].append({'type': 'video'})
+
+             if isinstance(_input, textInput):
+                 self.conversation[-1]['content'].append(
+                     {
+                         'type': 'text',
+                         'text': _input()
+                     }
+                 )
+
+     @spaces.GPU
+     def oneImagecall(self, image_input: Image.Image, user_input):
+         # Reset the conversation so each request carries exactly one image placeholder,
+         # matching the single image handed to the processor below.
+         self.conversation = []
+         inputs = [imageInput(image_input), textInput(user_input)]
+
+         self.addToConversation(inputs=inputs)
+
+         # Preprocess the inputs
+         text_prompt = self.processor.apply_chat_template(self.conversation, add_generation_prompt=True)
+         # Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'
+
+         inputs = self.processor(text=[text_prompt], images=[inputs[0]()], padding=True, return_tensors="pt")
+         inputs = inputs.to(self.model.device)
+
+         # Inference: generation of the output
+         output_ids = self.model.generate(**inputs, max_new_tokens=128)
+         generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+         output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+         if self.verbose:
+             print(output_text)
+
+         return output_text[0]
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ numpy==1.24.4
+ Pillow==10.3.0
+ Requests==2.31.0
+ torch
+ torchvision
+ git+https://github.com/huggingface/transformers.git
+ accelerate
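
A minimal sketch of how the wrapper in qwenvl.py could be smoke-tested outside the Gradio UI, assuming a local test image at `test.jpg` (hypothetical path) and hardware with enough memory for the 7B checkpoint; outside a Hugging Face Space the `@spaces.GPU` decorator should have no effect:

```python
# Hypothetical local smoke test; not part of the uploaded files.
from PIL import Image

from qwenvl import QwenVLModel

model = QwenVLModel()                          # downloads Qwen/Qwen2-VL-7B-Instruct on first run
image = Image.open("test.jpg").convert("RGB")  # hypothetical local image
print(model.oneImagecall(image, "Describe this image."))
```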