hperkins committed
Commit ae2331d
1 Parent(s): 171cc73

Update handler.py

Files changed (1):
  1. handler.py +23 -13
handler.py CHANGED
@@ -1,20 +1,30 @@
-import torch
-import json
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
+import torch
+import json
 
 class EndpointHandler:
     def __init__(self, model_dir):
-        # Load the model and processor for Qwen2-VL-7B
+        # Set minimum and maximum pixel count for images
+        min_pixels = 256 * 28 * 28
+        max_pixels = 1280 * 28 * 28
+
+        # Load model and processor with pixel constraints
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_dir,
-            torch_dtype=torch.float16,  # FP16 precision to reduce memory
-            device_map="auto"  # Automatically distribute model across devices
+            torch_dtype=torch.float16,  # Use FP16 for reduced memory usage
+            device_map="auto"  # Automatically assigns the model to the available GPU(s)
+        )
+
+        # Load the processor with the new pixel limits for images/videos
+        self.processor = AutoProcessor.from_pretrained(
+            model_dir,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels
         )
-        self.processor = AutoProcessor.from_pretrained(model_dir)
+
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.eval()
-        # Enable gradient checkpointing to save memory
-        self.model.gradient_checkpointing_enable()
 
     def preprocess(self, request_data):
         # Handle image and video input from the request
@@ -39,17 +49,17 @@ class EndpointHandler:
             return_tensors="pt",
         )
 
-        return inputs.to("cuda")
+        return inputs.to(self.device)
 
     def inference(self, inputs):
         # Perform inference with the model
         with torch.no_grad():
-            # Generate the output
+            # Generate the output with memory-efficient settings
             generated_ids = self.model.generate(
                 **inputs,
-                max_new_tokens=128,
-                num_beams=1,
-                max_batch_size=1
+                max_new_tokens=128,  # Limit output length
+                num_beams=1,  # Set beam size to reduce memory consumption
+                max_batch_size=1  # Set batch size to 1 for memory optimization
            )
 
        # Trim the output (remove input tokens from generated output)
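
The new min_pixels / max_pixels pair bounds how much each image is resized before encoding. In Qwen2-VL, one visual token covers a 28x28 pixel block (14x14 ViT patches merged 2x2), so 256 * 28 * 28 and 1280 * 28 * 28 pin each image to roughly 256-1280 visual tokens. A rough sketch of that arithmetic, assuming aspect-ratio-preserving rescaling; the scaled_size helper below is illustrative, not the processor's exact resize code (which also snaps each side to a multiple of 28):

import math

MIN_PIXELS = 256 * 28 * 28     # 200,704 px   -> ~256 visual tokens
MAX_PIXELS = 1280 * 28 * 28    # 1,003,520 px -> ~1280 visual tokens

def scaled_size(width: int, height: int) -> tuple[int, int]:
    # Scale the image so its pixel count lands inside [MIN_PIXELS, MAX_PIXELS]
    # while preserving aspect ratio (approximation of the processor's resize).
    pixels = width * height
    if pixels > MAX_PIXELS:
        scale = math.sqrt(MAX_PIXELS / pixels)
    elif pixels < MIN_PIXELS:
        scale = math.sqrt(MIN_PIXELS / pixels)
    else:
        scale = 1.0
    return round(width * scale), round(height * scale)

w, h = scaled_size(4032, 3024)         # a typical 12 MP photo
print(w, h, round(w * h / (28 * 28)))  # ~1157 x 868 -> ~1280 visual tokens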
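
The switch from inputs.to("cuda") to inputs.to(self.device) lets the handler fall back to CPU instead of raising at preprocess time on a machine without a GPU. Below is a hypothetical smoke test of the updated handler; the request payload follows the usual Qwen2-VL message format but is a guess, since this diff shows neither preprocess nor the response shape. One thing worth verifying separately: max_batch_size is not a documented transformers generate() argument, so depending on the installed version it may be ignored or rejected.

# Hypothetical smoke test -- the payload shape and model id are assumptions,
# not something shown in this diff.
handler = EndpointHandler("Qwen/Qwen2-VL-7B-Instruct")

request_data = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "https://example.com/demo.jpg"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
}

inputs = handler.preprocess(request_data)  # tensors land on handler.device
result = handler.inference(inputs)         # trimmed generated token ids
print(result)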