hperkins
/

Qwen2-VL-7B-Instruct

Image-Text-to-Text

text-generation-inference

Inference Endpoints

Model card Files and versions Community

hperkins committed on Sep 4

Commit

6b10bcf

•

1 Parent(s): d67e0d7

Update handler.py

Files changed (1) hide show

handler.py +6 -12

handler.py CHANGED Viewed

@@ -3,15 +3,15 @@ from qwen_vl_utils import process_vision_info
 import torch
 import json
-class Qwen2VL7bHandler:
-    def __init__(self):
         # Load the model and processor for Qwen2-VL-7B without FlashAttention2
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
-            "Qwen/Qwen2-VL-7B-Instruct",
             torch_dtype=torch.float16,  # Use FP16 for reduced memory usage
             device_map="auto"  # Automatically assigns the model to the available GPU(s)
         )
-        self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.to(self.device)
         self.model.eval()
@@ -72,7 +72,7 @@ class Qwen2VL7bHandler:
         )
         return output_text
-    def handle(self, request):
         try:
             # Parse the JSON request data
             request_data = json.loads(request)
@@ -84,10 +84,4 @@ class Qwen2VL7bHandler:
             result = self.postprocess(outputs)
             return json.dumps({"result": result})
         except Exception as e:
-            return json.dumps({"error": str(e)})
-# Instantiate the handler for deployment
-_service = Qwen2VL7bHandler()
-def handle(request):
-    return _service.handle(request)

 import torch
 import json
+class EndpointHandler:
+    def __init__(self, model_dir):
         # Load the model and processor for Qwen2-VL-7B without FlashAttention2
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_dir,
             torch_dtype=torch.float16,  # Use FP16 for reduced memory usage
             device_map="auto"  # Automatically assigns the model to the available GPU(s)
         )
+        self.processor = AutoProcessor.from_pretrained(model_dir)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.to(self.device)
         self.model.eval()
         )
         return output_text
+    def __call__(self, request):
         try:
             # Parse the JSON request data
             request_data = json.loads(request)
             result = self.postprocess(outputs)
             return json.dumps({"result": result})
         except Exception as e:
+            return json.dumps({"error": str(e)})