hperkins committed
Commit f43b9bc
Parent: 20ed6e0

Update handler.py

Files changed (1)
  1. handler.py +21 -27
handler.py CHANGED
@@ -2,37 +2,30 @@ from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import torch
 import json
-import os
-
-# Set the environment variable to handle memory fragmentation
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
 class EndpointHandler:
     def __init__(self, model_dir):
-        # Load the model with automatic device dispatching
+        # Load the model and processor for Qwen2-VL-7B
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_dir,
-            torch_dtype=torch.float16,  # Use FP16 for memory efficiency
-            device_map="auto",  # Auto device dispatch across available GPUs
-            low_cpu_mem_usage=True  # Minimize CPU memory usage
+            torch_dtype=torch.float32,  # Load weights in full (float32) precision
+            device_map="auto"  # Automatically assign to available GPU(s)
         )
         self.processor = AutoProcessor.from_pretrained(model_dir)
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        # No need to move model to device manually; device_map handles it
         self.model.eval()
 
-        # Enable gradient checkpointing for further memory optimization
+        # Enable gradient checkpointing for memory savings
         self.model.gradient_checkpointing_enable()
 
     def preprocess(self, request_data):
-        # Handle the request and extract vision data (images, videos)
+        # Handle image and video input from the request
         messages = request_data.get('messages')
         if not messages:
             raise ValueError("Messages are required")
-
-        # Process vision input from the messages
+
+        # Process vision info (image or video) from the messages
         image_inputs, video_inputs = process_vision_info(messages)
-
+
         # Prepare text input for the chat model
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
@@ -47,30 +40,31 @@ class EndpointHandler:
             return_tensors="pt",
         )
 
-        return inputs.to(self.device)
+        return inputs.to(self.model.device)
 
     def inference(self, inputs):
-        # Perform inference using memory-efficient settings
+        # Perform inference with the model
         with torch.no_grad():
             generated_ids = self.model.generate(
-                **inputs,
-                max_new_tokens=64,  # Reduce max tokens for memory optimization
-                num_beams=1,  # Reduce beam size to save memory
-                max_batch_size=1  # Keep batch size small to minimize memory usage
+                **inputs,
+                max_new_tokens=256,  # Increased token length for richer output
+                num_beams=5,  # Increase beam size for better quality
+                early_stopping=True,  # Stop when all beams have finished
+                max_batch_size=1  # Keep batch size small to manage memory usage
             )
 
-        # Trim the output by removing input tokens from the generated output
+        # Trim the output (remove input tokens from generated output)
         generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
 
-        # Clear CUDA memory cache after inference to free up memory
+        # Clear the CUDA cache after inference to release unused memory
         torch.cuda.empty_cache()
 
         return generated_ids_trimmed
 
     def postprocess(self, inference_output):
-        # Decode the model's output into human-readable text
+        # Decode the generated output from the model
         output_text = self.processor.batch_decode(
             inference_output, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
@@ -78,13 +72,13 @@ class EndpointHandler:
 
     def __call__(self, request):
         try:
-            # Parse the JSON request
+            # Parse the JSON request data
             request_data = json.loads(request)
-            # Preprocess the input data
+            # Preprocess the input data (text, images, videos)
             inputs = self.preprocess(request_data)
             # Perform inference
             outputs = self.inference(inputs)
-            # Postprocess the output and return the result
+            # Postprocess the output
            result = self.postprocess(outputs)
             return json.dumps({"result": result})
         except Exception as e:
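
For reference, below is a minimal sketch of how the updated handler could be exercised locally. The model directory, image URL, and prompt are illustrative placeholders, not part of this commit; the payload simply follows the messages structure that preprocess and process_vision_info expect.

# Illustrative local smoke test (not part of the commit). Assumes the model
# weights live in "./Qwen2-VL-7B-Instruct" and the image URL is reachable;
# both values are placeholders.
import json
from handler import EndpointHandler

handler = EndpointHandler("./Qwen2-VL-7B-Instruct")

request = json.dumps({
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "https://example.com/demo.jpg"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
})

response = handler(request)
print(response)  # JSON string of the form {"result": ["..."]}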