hperkins committed
Commit f40466f
1 Parent(s): ae2331d

Update handler.py

Files changed (1)
  1. handler.py +24 -36
handler.py CHANGED
@@ -1,46 +1,36 @@
+import json
+import torch
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
-import torch
-import json
+
 
 class EndpointHandler:
     def __init__(self, model_dir):
-        # Set minimum and maximum pixel count for images
-        min_pixels = 256 * 28 * 28
-        max_pixels = 1280 * 28 * 28
-
-        # Load model and processor with pixel constraints
+        # Load the model and processor for Qwen2-VL
         self.model = Qwen2VLForConditionalGeneration.from_pretrained(
             model_dir,
-            torch_dtype=torch.float16,  # Use FP16 for reduced memory usage
-            device_map="auto"  # Automatically assigns the model to the available GPU(s)
+            torch_dtype=torch.float16,  # FP16 for memory efficiency
+            device_map="auto"  # Automatically assign the model to the available GPU(s)
         )
-
-        # Load the processor with the new pixel limits for images/videos
-        self.processor = AutoProcessor.from_pretrained(
-            model_dir,
-            min_pixels=min_pixels,
-            max_pixels=max_pixels
-        )
-
+        self.processor = AutoProcessor.from_pretrained(model_dir)
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.model.eval()
 
     def preprocess(self, request_data):
-        # Handle image and video input from the request
+        # Parse messages, extract video and text inputs
         messages = request_data.get('messages')
         if not messages:
             raise ValueError("Messages are required")
-
-        # Process vision info (image or video) from the messages
+
+        # Process vision (video) and text inputs
         image_inputs, video_inputs = process_vision_info(messages)
 
-        # Prepare text input for the chat model
+        # Prepare text input for the model using processor
         text = self.processor.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
         )
 
-        # Prepare inputs for the model (text + vision inputs)
+        # Create inputs for the model
        inputs = self.processor(
             text=[text],
             images=image_inputs,
@@ -48,32 +38,27 @@ class EndpointHandler:
             padding=True,
             return_tensors="pt",
         )
-
         return inputs.to(self.device)
 
     def inference(self, inputs):
-        # Perform inference with the model
+        # Run inference on the model
         with torch.no_grad():
-            # Generate the output with memory-efficient settings
             generated_ids = self.model.generate(
                 **inputs,
-                max_new_tokens=128,  # Limit output length
-                num_beams=1,  # Set beam size to reduce memory consumption
-                max_batch_size=1  # Set batch size to 1 for memory optimization
+                max_new_tokens=128,  # Limit the output length
+                num_beams=1,  # Reduce memory usage
+                max_batch_size=1  # Process one batch at a time
             )
 
-        # Trim the output (remove input tokens from generated output)
+        # Trim generated outputs to remove input tokens
         generated_ids_trimmed = [
             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
         ]
 
-        # Clear the CUDA cache after inference to release unused memory
-        torch.cuda.empty_cache()
-
         return generated_ids_trimmed
 
     def postprocess(self, inference_output):
-        # Decode the generated output from the model
+        # Decode generated output into human-readable text
         output_text = self.processor.batch_decode(
             inference_output, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
@@ -81,13 +66,16 @@ class EndpointHandler:
 
     def __call__(self, request):
         try:
-            # Parse the JSON request data
+            # Parse the incoming request data
             request_data = json.loads(request)
-            # Preprocess the input data (text, images, videos)
+
+            # Preprocess the input data
            inputs = self.preprocess(request_data)
+
             # Perform inference
             outputs = self.inference(inputs)
-            # Postprocess the output
+
+            # Postprocess the outputs and return results
             result = self.postprocess(outputs)
             return json.dumps({"result": result})
         except Exception as e:
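
For context, here is a minimal sketch of how the updated handler might be exercised locally. The payload follows the Qwen2-VL chat-message format that process_vision_info consumes; the image URL and the model directory passed to EndpointHandler are placeholders and are not part of this commit.

# Minimal local smoke test for the updated handler (sketch; URL and model
# directory below are placeholder assumptions, not part of this commit).
import json

from handler import EndpointHandler

# Messages use the Qwen2-VL chat format expected by process_vision_info:
# each content item is either an image/video reference or a text segment.
request = json.dumps({
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "https://example.com/demo.jpg"},  # placeholder URL
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
})

handler = EndpointHandler("Qwen/Qwen2-VL-7B-Instruct")  # placeholder model path/ID
print(handler(request))  # prints the handler's JSON response, e.g. {"result": ["..."]}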