hperkins committed on
Commit
d67e0d7
1 Parent(s): 057b8f0

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +4 -5
handler.py CHANGED
@@ -5,19 +5,18 @@ import json
5
 
6
  class Qwen2VL7bHandler:
7
  def __init__(self):
8
- # Load the model and processor for Qwen2-VL-7B with FP16 precision and flash attention enabled
9
  self.model = Qwen2VLForConditionalGeneration.from_pretrained(
10
  "Qwen/Qwen2-VL-7B-Instruct",
11
- torch_dtype=torch.float16,
12
- attn_implementation="flash_attention_2", # Enable flash attention for efficiency
13
- device_map="auto" # Automatically assign devices for model
14
  )
15
  self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
16
  self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
  self.model.to(self.device)
18
  self.model.eval()
19
 
20
- # Enable gradient checkpointing to save memory during inference
21
  self.model.gradient_checkpointing_enable()
22
 
23
  def preprocess(self, request_data):
 
5
 
6
class Qwen2VL7bHandler:
    """Inference handler for the Qwen2-VL-7B-Instruct vision-language model.

    Loads the model in FP16 with accelerate's automatic device placement and
    prepares the matching processor for request preprocessing.
    """

    def __init__(self):
        # Load the model without FlashAttention2; device_map="auto" lets
        # accelerate place the weight shards on the available GPU(s)/CPU.
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2-VL-7B-Instruct",
            torch_dtype=torch.float16,  # FP16 halves memory vs. FP32
            device_map="auto",  # automatic device placement via accelerate
        )
        self.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
        # Kept for downstream code (e.g. preprocess) that moves input tensors.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # BUGFIX: removed `self.model.to(self.device)` — with device_map="auto"
        # the model is already dispatched by accelerate, and calling .to() on a
        # dispatched model is unsupported (raises / conflicts with the map).
        self.model.eval()  # inference mode: disable dropout etc.
        # BUGFIX: removed `self.model.gradient_checkpointing_enable()` —
        # checkpointing only trades compute for memory during the backward
        # pass (training); in inference it gives no saving and can force
        # use_cache=False, slowing autoregressive generation.
22
  def preprocess(self, request_data):