hperkins committed
Commit ae2331d
1 Parent(s): 171cc73

Update handler.py

Files changed (1):
  1. handler.py +23 -13
handler.py CHANGED
@@ -1,20 +1,30 @@
-import torch
-import json
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
+import torch
+import json
 
 class EndpointHandler:
     def __init__(self, model_dir):
-        # Load the model and processor for Qwen2-VL-7B
+        # Set minimum and maximum pixel count for images
+        min_pixels = 256 * 28 * 28
+        max_pixels = 1280 * 28 * 28
+
+        # Load model and processor with pixel constraints
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_dir,
-            torch_dtype=torch.float16,  # FP16 precision to reduce memory
-            device_map="auto"  # Automatically distribute model across devices
+            torch_dtype=torch.float16,  # Use FP16 for reduced memory usage
+            device_map="auto"  # Automatically assigns the model to the available GPU(s)
+        )
+
+        # Load the processor with the new pixel limits for images/videos
+        self.processor = AutoProcessor.from_pretrained(
+            model_dir,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels
         )
-        self.processor = AutoProcessor.from_pretrained(model_dir)
+
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.eval()
-        # Enable gradient checkpointing to save memory
-        self.model.gradient_checkpointing_enable()
 
     def preprocess(self, request_data):
         # Handle image and video input from the request
@@ -39,17 +49,17 @@ class EndpointHandler:
             return_tensors="pt",
         )
 
-        return inputs.to("cuda")
+        return inputs.to(self.device)
 
     def inference(self, inputs):
         # Perform inference with the model
         with torch.no_grad():
-            # Generate the output
+            # Generate the output with memory-efficient settings
             generated_ids = self.model.generate(
                 **inputs,
-                max_new_tokens=128,
-                num_beams=1,
-                max_batch_size=1
+                max_new_tokens=128,  # Limit output length
+                num_beams=1,  # Set beam size to reduce memory consumption
+                max_batch_size=1  # Set batch size to 1 for memory optimization
            )
 
        # Trim the output (remove input tokens from generated output)
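
The new min_pixels / max_pixels pair bounds how much each image is resized before encoding. In Qwen2-VL, one visual token covers a 28x28 pixel block (14x14 ViT patches merged 2x2), so 256 * 28 * 28 and 1280 * 28 * 28 pin each image to roughly 256-1280 visual tokens. A rough sketch of that arithmetic, assuming aspect-ratio-preserving rescaling; the scaled_size helper below is illustrative, not the processor's exact resize code (which also snaps each side to a multiple of 28):

import math

MIN_PIXELS = 256 * 28 * 28     # 200,704 px   -> ~256 visual tokens
MAX_PIXELS = 1280 * 28 * 28    # 1,003,520 px -> ~1280 visual tokens

def scaled_size(width: int, height: int) -> tuple[int, int]:
    # Scale the image so its pixel count lands inside [MIN_PIXELS, MAX_PIXELS]
    # while preserving aspect ratio (approximation of the processor's resize).
    pixels = width * height
    if pixels > MAX_PIXELS:
        scale = math.sqrt(MAX_PIXELS / pixels)
    elif pixels < MIN_PIXELS:
        scale = math.sqrt(MIN_PIXELS / pixels)
    else:
        scale = 1.0
    return round(width * scale), round(height * scale)

w, h = scaled_size(4032, 3024)         # a typical 12 MP photo
print(w, h, round(w * h / (28 * 28)))  # ~1157 x 868 -> ~1280 visual tokens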
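
The switch from inputs.to("cuda") to inputs.to(self.device) lets the handler fall back to CPU instead of raising at preprocess time on a machine without a GPU. Below is a hypothetical smoke test of the updated handler; the request payload follows the usual Qwen2-VL message format but is a guess, since this diff shows neither preprocess nor the response shape. One thing worth verifying separately: max_batch_size is not a documented transformers generate() argument, so depending on the installed version it may be ignored or rejected.

# Hypothetical smoke test -- the payload shape and model id are assumptions,
# not something shown in this diff.
handler = EndpointHandler("Qwen/Qwen2-VL-7B-Instruct")

request_data = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "https://example.com/demo.jpg"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
}

inputs = handler.preprocess(request_data)  # tensors land on handler.device
result = handler.inference(inputs)         # trimmed generated token ids
print(result)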