hperkins
/

Qwen2-VL-7B-Instruct

@@ -1,46 +1,36 @@
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
-import torch
-import json
 class EndpointHandler:
     def __init__(self, model_dir):
-        # Set minimum and maximum pixel count for images
-        min_pixels = 256 * 28 * 28
-        max_pixels = 1280 * 28 * 28
-        # Load model and processor with pixel constraints
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_dir,
-            torch_dtype=torch.float16,  # Use FP16 for reduced memory usage
-            device_map="auto"  # Automatically assigns the model to the available GPU(s)
         )
-        # Load the processor with the new pixel limits for images/videos
-        self.processor = AutoProcessor.from_pretrained(
-            model_dir,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels
-        )
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.eval()
     def preprocess(self, request_data):
-        # Handle image and video input from the request
         messages = request_data.get('messages')
         if not messages:
             raise ValueError("Messages are required")
-        # Process vision info (image or video) from the messages
         image_inputs, video_inputs = process_vision_info(messages)
-        # Prepare text input for the chat model
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
-        # Prepare inputs for the model (text + vision inputs)
         inputs = self.processor(
             text=[text],
             images=image_inputs,
@@ -48,32 +38,27 @@ class EndpointHandler:
             padding=True,
             return_tensors="pt",
         )
         return inputs.to(self.device)
     def inference(self, inputs):
-        # Perform inference with the model
         with torch.no_grad():
-            # Generate the output with memory-efficient settings
             generated_ids = self.model.generate(
                 **inputs,
-                max_new_tokens=128,  # Limit output length
-                num_beams=1,  # Set beam size to reduce memory consumption
-                max_batch_size=1  # Set batch size to 1 for memory optimization
             )
-        # Trim the output (remove input tokens from generated output)
         generated_ids_trimmed = [
             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
-        # Clear the CUDA cache after inference to release unused memory
-        torch.cuda.empty_cache()
         return generated_ids_trimmed
     def postprocess(self, inference_output):
-        # Decode the generated output from the model
         output_text = self.processor.batch_decode(
             inference_output, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
@@ -81,13 +66,16 @@ class EndpointHandler:
     def __call__(self, request):
         try:
-            # Parse the JSON request data
             request_data = json.loads(request)
-            # Preprocess the input data (text, images, videos)
             inputs = self.preprocess(request_data)
             # Perform inference
             outputs = self.inference(inputs)
-            # Postprocess the output
             result = self.postprocess(outputs)
             return json.dumps({"result": result})
         except Exception as e:

+import json
+import torch
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 class EndpointHandler:
     def __init__(self, model_dir):
         """Load the Qwen2-VL model and its processor from ``model_dir``.

         The model is loaded in half precision with ``device_map="auto"`` and
         put into evaluation mode. ``self.device`` is "cuda" when CUDA is
         available and "cpu" otherwise.

         Args:
             model_dir: Location passed to both ``from_pretrained`` calls.
         """
         load_options = {
             "torch_dtype": torch.float16,
             "device_map": "auto",
         }
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_dir, **load_options
         )
         self.processor = AutoProcessor.from_pretrained(model_dir)
         device_name = "cuda" if torch.cuda.is_available() else "cpu"
         self.device = torch.device(device_name)
         # Switch the model to evaluation mode before serving requests.
         self.model.eval()
     def preprocess(self, request_data):
         """Turn a request payload into model inputs placed on ``self.device``.

         Args:
             request_data: Mapping read with ``.get('messages')``.

         Returns:
             The processor output after ``.to(self.device)``.

         Raises:
             ValueError: If ``messages`` is missing or empty.

         NOTE(review): this listing comes from a diff and a hunk boundary
         appears to fall inside the processor call below, so an argument
         such as ``videos=video_inputs`` may be elided here -- confirm
         against the full file.
         """
+        # Parse messages, extract video and text inputs
         messages = request_data.get('messages')
         if not messages:
             raise ValueError("Messages are required")
+        # Process vision (video) and text inputs
         image_inputs, video_inputs = process_vision_info(messages)
+        # Prepare text input for the model using processor
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
+        # Create inputs for the model
         inputs = self.processor(
             text=[text],
             images=image_inputs,
             padding=True,
             return_tensors="pt",
         )
         return inputs.to(self.device)
     def inference(self, inputs):
+        # Run inference on the model
         with torch.no_grad():
             generated_ids = self.model.generate(
                 **inputs,
+                max_new_tokens=128,  # Limit the output length
+                num_beams=1,  # Reduce memory usage
+                max_batch_size=1  # Process one batch at a time
             )
+        # Trim generated outputs to remove input tokens
         generated_ids_trimmed = [
             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
         return generated_ids_trimmed
     def postprocess(self, inference_output):
         """Decode generated token ids into text with the processor.

         Args:
             inference_output: Token id sequences handed to
                 ``self.processor.batch_decode``.

         Special tokens are skipped and tokenization spaces are left as-is.
         """
+        # Decode generated output into human-readable text
         output_text = self.processor.batch_decode(
             inference_output, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
         # NOTE(review): no return statement is visible in this listing; the
         # diff appears to elide lines after this call. Confirm that the full
         # file returns ``output_text``, since the caller uses the result.
     def __call__(self, request):
         try:
+            # Parse the incoming request data
             request_data = json.loads(request)
+            # Preprocess the input data
             inputs = self.preprocess(request_data)
             # Perform inference
             outputs = self.inference(inputs)
+            # Postprocess the outputs and return results
             result = self.postprocess(outputs)
             return json.dumps({"result": result})
         except Exception as e: