import json

import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info


class EndpointHandler:
    def __init__(self, model_dir):
        # Track the primary device so inputs can be moved to it later
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        try:
            # Load the model with automatic device mapping and memory-efficient precision
            self.model = Qwen2VLForConditionalGeneration.from_pretrained(
                model_dir,
                torch_dtype=torch.float16,  # Half precision reduces GPU memory use
                device_map="auto",          # Let accelerate place the weights on GPU(s)
            )
            # Note: with device_map="auto" the weights are already placed, so a
            # follow-up model.to(device) call is unnecessary and can error out.
        except Exception as e:
            print(f"Error loading model: {e}")
            raise
        try:
            # The processor bundles the tokenizer and the image/video processor
            self.processor = AutoProcessor.from_pretrained(model_dir)
        except Exception as e:
            print(f"Error loading processor: {e}")
            raise

    def preprocess(self, request_data):
        # Extract the chat-style message list
        messages = request_data.get("messages")
        if not messages:
            raise ValueError("Missing 'messages' in request data.")
        # Separate the visual inputs (images/videos) from the messages
        image_inputs, video_inputs = process_vision_info(messages)
        # Render the chat template into a single prompt string
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # Tokenize the prompt and encode the vision inputs into model-ready tensors
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(self.device)
        return inputs

    def inference(self, inputs):
        # Generate directly with the model: the "visual-question-answering"
        # pipeline used previously does not accept Qwen2-VL checkpoints, and
        # the processor output already holds model-ready tensors.
        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, max_new_tokens=256)
        # Decode only the newly generated tokens, not the echoed prompt
        trimmed_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(inputs.input_ids, generated_ids)
        ]
        return self.processor.batch_decode(
            trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

    def postprocess(self, inference_output):
        # Serialize the decoded answer(s) to JSON
        return json.dumps(inference_output)

    def __call__(self, request):
        try:
            # Parse the incoming request (accept a JSON string or an already-parsed dict)
            request_data = json.loads(request) if isinstance(request, str) else request
            # Preprocess the inputs, run generation, then serialize the result
            inputs = self.preprocess(request_data)
            result = self.inference(inputs)
            return self.postprocess(result)
        except Exception as e:
            error_message = f"Error: {str(e)}"
            print(error_message)
            return json.dumps({"error": error_message})
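

# Usage sketch: a minimal local smoke test for the handler. The model
# directory and image URL below are placeholder assumptions, not values
# from this repository; point them at a real Qwen2-VL checkpoint and a
# reachable image before running.
if __name__ == "__main__":
    handler = EndpointHandler("./qwen2-vl-model")  # hypothetical local path
    request = json.dumps({
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": "https://example.com/demo.jpg"},
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ]
    })
    print(handler(request))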