hperkins committed on
Commit
d67e0d7
1 Parent(s): 057b8f0

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +4 -5
handler.py CHANGED
@@ -5,19 +5,18 @@ import json
5
 
6
  class Qwen2VL7bHandler:
7
  def __init__(self):
8
- # Load the model and processor for Qwen2-VL-7B with FP16 precision and flash attention enabled
9
  self.model = Qwen2VLForConditionalGeneration.from_pretrained(
10
  "Qwen/Qwen2-VL-7B-Instruct",
11
- torch_dtype=torch.float16,
12
- attn_implementation="flash_attention_2", # Enable flash attention for efficiency
13
- device_map="auto" # Automatically assign devices for model
14
  )
15
  self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
16
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
  self.model.to(self.device)
18
  self.model.eval()
19
 
20
- # Enable gradient checkpointing to save memory during inference
21
  self.model.gradient_checkpointing_enable()
22
 
23
  def preprocess(self, request_data):
 
5
 
6
class Qwen2VL7bHandler:
    """Inference handler for the Qwen2-VL-7B-Instruct vision-language model.

    Loads the model in FP16 with accelerate's automatic device placement and
    prepares the matching processor for request preprocessing.
    """

    def __init__(self):
        # Load the model without FlashAttention2; device_map="auto" lets
        # accelerate place the weight shards on the available GPU(s)/CPU.
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2-VL-7B-Instruct",
            torch_dtype=torch.float16,  # FP16 halves memory vs. FP32
            device_map="auto",  # automatic device placement via accelerate
        )
        self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
        # Kept for downstream code (e.g. preprocess) that moves input tensors.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # BUGFIX: removed `self.model.to(self.device)` — with device_map="auto"
        # the model is already dispatched by accelerate, and calling .to() on a
        # dispatched model is unsupported (raises / conflicts with the map).
        self.model.eval()  # inference mode: disable dropout etc.
        # BUGFIX: removed `self.model.gradient_checkpointing_enable()` —
        # checkpointing only trades compute for memory during the backward
        # pass (training); in inference it gives no saving and can force
        # use_cache=False, slowing autoregressive generation.
22
  def preprocess(self, request_data):