future-xy committed on
Commit 1ae96c8
1 Parent(s): 581fdbd
backend-cli.py CHANGED
@@ -12,7 +12,7 @@ from src.backend.run_eval_suite import run_evaluation
 from src.backend.manage_requests import check_completed_evals, get_eval_requests, set_eval_request
 from src.backend.sort_queue import sort_models_by_priority
 from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PATH_BACKEND, DEVICE, LIMIT, Task
-
+LIMIT=2
 from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
 
@@ -124,7 +124,7 @@ def request_to_result_name(request: EvalRequest) -> str:
 
 
 def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
-    batch_size = 4
+    batch_size = 1
     try:
         results = run_evaluation(
             eval_request=eval_request,
@@ -404,7 +404,8 @@ if __name__ == "__main__":
     local_debug = args.debug
     # debug specific task by ping
     if local_debug:
-        debug_model_names = ["mistralai/Mixtral-8x7B-Instruct-v0.1"]
+        # debug_model_names = ["mistralai/Mixtral-8x7B-Instruct-v0.1"]
+        debug_model_names = ["facebook/opt-1.3b"]
         # debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
         debug_task_name = 'selfcheckgpt'
         # debug_task_name = "mmlu"
@@ -415,7 +416,7 @@ if __name__ == "__main__":
                 if task_name != debug_task_name:
                     continue
                 eval_request = EvalRequest(
-                    model=debug_model_name, private=False, status="", json_filepath="", precision="float16"
+                    model=debug_model_name, private=False, status="", json_filepath="", precision="float16", inference_framework="hf-chat"
                 )
                 results = process_evaluation(task, eval_request)
     else:
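Together these hunks shrink the debug path: LIMIT=2 caps the number of evaluated examples, batch_size drops from 4 to 1, and the debug model switches to the small facebook/opt-1.3b checkpoint. A minimal sketch of the resulting debug request is shown below; the surrounding task loop and CLI flag wiring are outside this diff and assumed.

# Sketch only: restates the debug branch with the new inference_framework field.
eval_request = EvalRequest(
    model="facebook/opt-1.3b",
    private=False,
    status="",
    json_filepath="",
    precision="float16",
    inference_framework="hf-chat",  # field introduced by this commit
)
results = process_evaluation(task, eval_request)  # `task` is the selected debug Task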
src/backend/hflm_with_measurement.py CHANGED
@@ -1,7 +1,7 @@
 import copy
 import os
 from datetime import timedelta
-import random
+from time import time
 from pathlib import Path
 from typing import List, Literal, Optional, Tuple, Union
 
@@ -22,6 +22,7 @@ from transformers.models.auto.modeling_auto import (
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,
     MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
 )
+from transformers import TextStreamer
 
 from lm_eval import utils
 from lm_eval.api.instance import Instance
@@ -37,6 +38,31 @@ from lm_eval.models.utils import (
 from lm_eval.models.huggingface import HFLM
 
 
+class StopWatch(TextStreamer):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.start_prefilling = None
+        self.prefilling_time = None
+        self.start_decoding = None
+        self.decoding_time = None
+        self.decoding_iterations = 0
+
+    def put(self, value):
+        if self.start_prefilling is None:
+            self.start_prefilling = time()
+            return
+        elif self.prefilling_time is None:
+            self.prefilling_time = time() - self.start_prefilling
+            self.start_decoding = time()
+        self.decoding_iterations += 1
+        return
+
+    def end(self):
+        if self.decoding_time is None and self.start_decoding is not None:
+            self.decoding_time = time() - self.start_decoding
+        return
+
+
 class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -59,14 +85,27 @@ class HFLMWithMeasurement(HFLM):
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
         )
-        return self.model.generate(
+        stop_watch = StopWatch(self.tokenizer)
+        start = time()
+        res = self.model.generate(
             input_ids=context,
             max_length=max_length,
             stopping_criteria=stopping_criteria,
             pad_token_id=self.tokenizer.pad_token_id,
             use_cache=True,
+            streamer=stop_watch,
             **generation_kwargs,
         )
+        end = time()
+
+        batch_size = context.shape[0]
+        output_length = stop_watch.decoding_iterations
+
+        end_to_end_time = (end - start) / batch_size
+        prefilling_time = stop_watch.prefilling_time / batch_size
+        decoding_time = stop_watch.decoding_time / batch_size
+        token_per_sec = output_length / decoding_time
+        return res, end_to_end_time, prefilling_time, token_per_sec
 
     def generate_until(
         self, requests: List[Instance], disable_tqdm: bool = False
@@ -174,7 +213,7 @@ class HFLMWithMeasurement(HFLM):
                 kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
 
             # perform batched generation
-            cont = self._model_generate(
+            cont, end_to_end_time, prefilling_time, token_per_sec = self._model_generate(
                 context=context_enc,
                 attention_mask=attn_masks,
                 stop=until,
@@ -196,7 +235,7 @@ class HFLMWithMeasurement(HFLM):
                         # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
                         s = s.split(term)[0]
 
-                res.append((s, random.random()))
+                res.append((s, end_to_end_time, prefilling_time, token_per_sec))
 
                 self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
                 pbar.update(1)
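The StopWatch works because generate() pushes the prompt ids to its streamer once before decoding starts and then pushes each newly generated token, so the gap between the first and second put() call approximates prefill time and later calls mark decode steps. Below is a minimal standalone sketch of the same idea; the model name and generation settings are arbitrary illustration choices, not part of this commit.

from time import time
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

class TimingStreamer(TextStreamer):
    # Same idea as StopWatch above: the first put() carries the prompt,
    # each later put() carries one decoding step.
    def __init__(self, tokenizer):
        super().__init__(tokenizer)
        self.t_prompt = None       # prompt pushed -> prefill starts
        self.t_first_token = None  # first generated token -> prefill done
        self.t_last = None         # most recent token
        self.steps = 0             # decoding steps observed

    def put(self, value):
        now = time()
        if self.t_prompt is None:
            self.t_prompt = now
        elif self.t_first_token is None:
            self.t_first_token = now
            self.steps += 1
        else:
            self.steps += 1
        self.t_last = now

    def end(self):
        pass  # nothing to flush; we only record timestamps

tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
inputs = tok("Hello, world", return_tensors="pt")
streamer = TimingStreamer(tok)
model.generate(**inputs, max_new_tokens=16, streamer=streamer)

prefill = streamer.t_first_token - streamer.t_prompt
decode = max(streamer.t_last - streamer.t_first_token, 1e-9)
print(f"prefill ~{prefill:.4f}s, decode ~{decode:.4f}s, ~{streamer.steps / decode:.1f} tok/s")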
src/backend/tasks/measurement_task_utils.py CHANGED
@@ -9,12 +9,20 @@ def process_results_decorator(func):
         # We process the results here
         processed_results = [r[0] for r in results]
 
-        latency = sum([r[1] for r in results]) / len(results)
-        print(f"Average latency: {latency}")
+        # end_to_end_time = end_to_end_time / batch_size
+        # prefilling_time = prefilling_time / batch_size
+        # token_per_sec = output_length / (decoding_time / batch_size)
+
+        end_to_end_time = sum([r[1] for r in results]) / len(results)
+        prefilling_time = sum([r[2] for r in results]) / len(results)
+        token_per_sec = sum([r[3] for r in results]) / len(results)
+        print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, token_per_sec: {token_per_sec}")
 
         # Now call the original process_results with the processed results
         result_dict = func(self, doc, processed_results, *args, **kwargs)
-        result_dict["latency"] = latency
+        result_dict["end_to_end_time"] = end_to_end_time
+        result_dict["prefilling_time"] = prefilling_time
+        result_dict["token_per_sec"] = token_per_sec
         return result_dict
     return wrapper
 
@@ -23,7 +31,9 @@ def aggregation_decorator(func):
     @functools.wraps(func)
     def wrapper(self, *args, **kwargs):
         aggregation_list = func(self, *args, **kwargs)
-        aggregation_list["latency"] = mean
+        aggregation_list["end_to_end_time"] = mean
+        aggregation_list["prefilling_time"] = mean
+        aggregation_list["token_per_sec"] = mean
         return aggregation_list
     return wrapper
 
@@ -32,7 +42,9 @@ def higher_is_better_decorator(func):
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        higher_is_better_dict = func(self, *args, **kwargs)
-        higher_is_better_dict["latency"] = False
+        higher_is_better_dict["end_to_end_time"] = False
+        higher_is_better_dict["prefilling_time"] = False
+        higher_is_better_dict["token_per_sec"] = True
        return higher_is_better_dict
    return wrapper
 
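These decorators are meant to wrap an lm-eval task's process_results, aggregation, and higher_is_better hooks so every metric dict also carries the timing fields returned by HFLMWithMeasurement. A hypothetical wiring sketch follows; the measure_system_metrics helper and the ConfigurableTask target are assumptions, not shown in this diff.

# Hypothetical glue code, not part of this commit.
def measure_system_metrics(cls):
    cls.process_results = process_results_decorator(cls.process_results)
    cls.aggregation = aggregation_decorator(cls.aggregation)
    cls.higher_is_better = higher_is_better_decorator(cls.higher_is_better)
    return cls

# Usage sketch:
# @measure_system_metrics
# class SelfCheckGPT(ConfigurableTask):
#     ...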