LOpeetu committed
Commit b08e04f
1 Parent(s): 51d098c

Upload 3 files

Files changed (3)
  1. app.py +50 -0
  2. qwenvl.py +88 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,50 @@
+ from qwenvl import QwenVLModel
+ import gradio as gr
+ from PIL import Image
+ from datetime import datetime
+ import os
+ import numpy as np
+
+ model = QwenVLModel()
+ DESCRIPTION = "[Qwen2-VL-7B Demo](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)"
+
+ def array_to_image_path(image_array):
+     # Convert numpy array to PIL Image
+     img = Image.fromarray(np.uint8(image_array))
+
+     # Generate a unique filename using timestamp
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     filename = f"image_{timestamp}.png"
+
+     # Save the image
+     img.save(filename)
+
+     # Get the full path of the saved image
+     full_path = os.path.abspath(filename)
+
+     return full_path
+
+ css = """
+ #output {
+     height: 500px;
+     overflow: auto;
+     border: 1px solid #ccc;
+ }
+ """
+
+ with gr.Blocks(css=css) as demo:
+     gr.Markdown(DESCRIPTION)
+     with gr.Tab(label="Qwen2-VL-7B Input"):
+         with gr.Row():
+             with gr.Column():
+                 # type="pil" hands the handler a PIL.Image directly, so no array conversion is needed here
+                 input_img = gr.Image(label="Input Picture", type="pil")
+                 text_input = gr.Textbox(label="Question")
+                 submit_btn = gr.Button(value="Submit")
+             with gr.Column():
+                 output_text = gr.Textbox(label="Output Text")
+
+     submit_btn.click(model.oneImagecall, [input_img, text_input], [output_text])
+
+ demo.queue(api_open=False)
+ demo.launch(debug=True)
qwenvl.py ADDED
@@ -0,0 +1,88 @@
+ from PIL import Image
+ import requests
+ import torch
+ from torchvision import io
+ from typing import Dict
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+ import spaces
+
+ class inputParent():
+     def __init__(self, raw_data, source_path=None):
+         self.sourcePath = source_path
+         self.rawData = raw_data
+
+     def __call__(self):
+         return self.rawData
+
+ class imageInput(inputParent):
+     def __init__(self, raw_data, source_path=None):
+         super().__init__(raw_data, source_path)
+
+ class videoInput(inputParent):
+     def __init__(self, raw_data, source_path=None):
+         super().__init__(raw_data, source_path)
+
+ class textInput(inputParent):
+     def __init__(self, raw_data, source_path=None):
+         super().__init__(raw_data, source_path)
+
+
+ class QwenVLModel():
+     def __init__(self,
+                  model='Qwen/Qwen2-VL-7B-Instruct',
+                  device_map='auto'):
+         self.modelName = model
+         self.deviceMap = device_map
+
+         self.model = Qwen2VLForConditionalGeneration.from_pretrained(self.modelName, torch_dtype="auto", device_map=self.deviceMap)
+         self.processor = AutoProcessor.from_pretrained(self.modelName)
+         self.conversation = []
+         self.verbose = True
+
+     def addToConversation(self, inputs, role='user'):
+         self.conversation.append(
+             {
+                 'role': role,
+                 'content': []
+             }
+         )
+
+         for _input in inputs:
+             if isinstance(_input, imageInput):
+                 self.conversation[-1]['content'].append({'type': 'image'})
+
+             if isinstance(_input, videoInput):
+                 self.conversation[-1]['content'].append({'type': 'video'})
+
+             if isinstance(_input, textInput):
+                 self.conversation[-1]['content'].append(
+                     {
+                         'type': 'text',
+                         'text': _input()
+                     }
+                 )
+
+     @spaces.GPU
+     def oneImagecall(self, image_input: Image.Image, user_input):
+         # Reset the conversation so each request carries exactly one image placeholder,
+         # matching the single image handed to the processor below.
+         self.conversation = []
+         inputs = [imageInput(image_input), textInput(user_input)]
+
+         self.addToConversation(inputs=inputs)
+
+         # Preprocess the inputs
+         text_prompt = self.processor.apply_chat_template(self.conversation, add_generation_prompt=True)
+         # Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'
+
+         inputs = self.processor(text=[text_prompt], images=[inputs[0]()], padding=True, return_tensors="pt")
+         inputs = inputs.to(self.model.device)
+
+         # Inference: generation of the output
+         output_ids = self.model.generate(**inputs, max_new_tokens=128)
+         generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+         output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+         if self.verbose:
+             print(output_text)
+
+         return output_text[0]
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ numpy==1.24.4
+ Pillow==10.3.0
+ Requests==2.31.0
+ torch
+ torchvision
+ git+https://github.com/huggingface/transformers.git
+ accelerate
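
A minimal sketch of how the wrapper in qwenvl.py could be smoke-tested outside the Gradio UI, assuming a local test image at `test.jpg` (hypothetical path) and hardware with enough memory for the 7B checkpoint; outside a Hugging Face Space the `@spaces.GPU` decorator should have no effect:

```python
# Hypothetical local smoke test; not part of the uploaded files.
from PIL import Image

from qwenvl import QwenVLModel

model = QwenVLModel()                          # downloads Qwen/Qwen2-VL-7B-Instruct on first run
image = Image.open("test.jpg").convert("RGB")  # hypothetical local image
print(model.oneImagecall(image, "Describe this image."))
```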