# QwenVL2Demo / qwenvl.py
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
import spaces

class inputParent():
    def __init__(self, raw_data, source_path=None):
        # Store the raw payload; the source path is optional because the
        # demo constructs these wrappers directly from in-memory data.
        self.rawData = raw_data
        self.sourcePath = source_path

    def __call__(self):
        return self.rawData


class imageInput(inputParent):
    pass


class videoInput(inputParent):
    pass


class textInput(inputParent):
    pass
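
# Illustrative usage of the input wrappers (a sketch; 'cat.png' and the
# prompt string are assumptions, not part of this repo): calling an
# instance returns its raw payload.
#
#   img = imageInput(Image.open('cat.png'))
#   txt = textInput('Describe this image.')
#   txt()  # -> 'Describe this image.'
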
class QwenVLModel():
    def __init__(self,
                 model='Qwen/Qwen2-VL-7B-Instruct',
                 device_map='auto'):
        self.modelName = model
        self.deviceMap = device_map
        # Load from the configured checkpoint and device map so the
        # constructor arguments actually take effect.
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(self.modelName, device_map=self.deviceMap)
        self.processor = AutoProcessor.from_pretrained(self.modelName)
        self.conversation = []
        self.verbose = True
    def addToConversation(self, inputs, role='user'):
        # Start a new message for this role, then append one content entry
        # per input in the structure expected by apply_chat_template.
        self.conversation.append({'role': role, 'content': []})
        for _input in inputs:
            if isinstance(_input, imageInput):
                self.conversation[-1]['content'].append({'type': 'image'})
            elif isinstance(_input, videoInput):
                self.conversation[-1]['content'].append({'type': 'video'})
            elif isinstance(_input, textInput):
                # Text entries carry their payload under the 'text' key.
                self.conversation[-1]['content'].append({'type': 'text', 'text': _input()})
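
    # For a single image-plus-text turn, addToConversation yields a message
    # shaped like this (the prompt text is an illustrative assumption):
    #
    #   {'role': 'user',
    #    'content': [{'type': 'image'},
    #                {'type': 'text', 'text': 'Describe this image.'}]}
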
    @spaces.GPU
    def oneImagecall(self, image_input: Image.Image, user_input):
        inputs = [imageInput(image_input), textInput(user_input)]
        self.addToConversation(inputs=inputs)
        # Preprocess the inputs
        text_prompt = self.processor.apply_chat_template(self.conversation, add_generation_prompt=True)
        # Expected output: '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe this image.<|im_end|>\n<|im_start|>assistant\n'
        model_inputs = self.processor(text=[text_prompt], images=[inputs[0]()], padding=True, return_tensors="pt")
        # Move the tensors to the model's device (the GPU inside the
        # @spaces.GPU context), not to CPU.
        model_inputs = model_inputs.to(self.model.device)
        # Inference: generate, then strip the prompt tokens from each output
        output_ids = self.model.generate(**model_inputs, max_new_tokens=128)
        generated_ids = [out_ids[len(in_ids):] for in_ids, out_ids in zip(model_inputs.input_ids, output_ids)]
        output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        if self.verbose:
            print(output_text)
        return output_text
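
# Minimal local usage sketch. Assumes a GPU runtime (for the @spaces.GPU
# decorator) and a local image file 'demo.jpg'; both names are illustrative
# assumptions, not shipped with this repo.
if __name__ == '__main__':
    qwen = QwenVLModel()
    image = Image.open('demo.jpg')
    # oneImagecall returns a list with one decoded string per batch item.
    print(qwen.oneImagecall(image, 'Describe this image.'))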