AppleSwing committed
Commit 9ffef81
Parent: f38163c

Fix some bugs

backend-cli.py CHANGED
@@ -17,7 +17,7 @@ from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult
 
 from src.envs import QUEUE_REPO, RESULTS_REPO, API, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
-from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus
+from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus, get_gpu_details
 
 from src.leaderboard.read_evals import get_raw_eval_results
 
@@ -142,9 +142,6 @@ def request_to_result_name(request: EvalRequest) -> str:
 def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
     batch_size = 1
     batch_size = eval_request.batch_size
-
-    if args.debug:
-        RESULTS_REPO = DEBUG_RESULTS_REPO
 
     init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
     # if init_gpu_info['Mem(M)'] > 500:
@@ -388,21 +385,7 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
 
     return False
 
-
-def get_gpu_details():
-    gpus = GPUtil.getGPUs()
-    gpu = gpus[0]
-    name = gpu.name.replace(" ", "-")
-    # Convert memory from MB to GB and round to nearest whole number
-    memory_gb = round(gpu.memoryTotal / 1024)
-    memory = f"{memory_gb}GB"
-    formatted_name = f"{name}-{memory}"
-    return formatted_name
-
 def process_pending_requests() -> bool:
-    if args.debug:
-        QUEUE_REPO = DEBUG_QUEUE_REPO
-
     sanity_checks()
     print("Processing pending requests")
     current_pending_status = [PENDING_STATUS]
@@ -472,6 +455,7 @@ def get_args():
     parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
     parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
                         help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
+    parser.add_argument("--debug_repo", action="store_true", help="Use debug repo")
     return parser.parse_args()
 
 
@@ -479,7 +463,7 @@ if __name__ == "__main__":
     args = get_args()
     local_debug = args.debug
     # debug specific task by ping
-    if local_debug:
+    if local_debug and not args.debug_repo:
         # debug_model_names = [args.model] # Use model from arguments
         # debug_task_name = [args.task] # Use task from arguments
         debug_model_names = args.model.split(",")
@@ -510,25 +494,43 @@ if __name__ == "__main__":
         results = process_evaluation(task, eval_request, limit=args.limit)
         # except Exception as e:
         #     print(f"debug running error: {e}")
-    else:
+    elif local_debug and args.debug_repo:
+        QUEUE_REPO = DEBUG_QUEUE_REPO
+        RESULTS_REPO = DEBUG_RESULTS_REPO
         while True:
             res = False
-
             # if random.randint(0, 10) == 0:
             res = process_pending_requests()
             print(f"waiting for 60 seconds")
             time.sleep(60)
-
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(100)
             #     else:
             #         res = process_finished_requests(100)
-
             #     time.sleep(60)
-
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(0)
             #     else:
             #         res = process_finished_requests(0)
+    elif not local_debug and not args.debug_repo:
+        while True:
+            res = False
+            # if random.randint(0, 10) == 0:
+            res = process_pending_requests()
+            print(f"waiting for 60 seconds")
+            time.sleep(60)
+            # if res is False:
+            #     if random.randint(0, 5) == 0:
+            #         res = maybe_refresh_results(100)
+            #     else:
+            #         res = process_finished_requests(100)
+            #     time.sleep(60)
+            # if res is False:
+            #     if random.randint(0, 5) == 0:
+            #         res = maybe_refresh_results(0)
+            #     else:
+            #         res = process_finished_requests(0)
+    else:
+        raise Exception("Cannot use debug_repo without local debug flag")
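
Note: the two removed in-function assignments (RESULTS_REPO in process_evaluation, QUEUE_REPO in process_pending_requests) only rebound local names and never redirected the module-level repos, which is presumably the bug being fixed; the replacement rebinds both at module scope under __main__. The resulting dispatch on the two flags, as a minimal sketch (the returned strings stand in for the inlined bodies above; dispatch itself is a hypothetical helper, not a function in this repo):

def dispatch(local_debug: bool, debug_repo: bool) -> str:
    if local_debug and not debug_repo:
        # --debug alone: run the models/tasks given on the command line, once
        return "local one-shot evaluation"
    elif local_debug and debug_repo:
        # --debug --debug_repo: worker loop against DEBUG_QUEUE_REPO / DEBUG_RESULTS_REPO
        return "poll the debug queue every 60 seconds"
    elif not local_debug and not debug_repo:
        # no flags: worker loop against the production queue
        return "poll the production queue every 60 seconds"
    else:
        # --debug_repo without --debug is rejected
        raise Exception("Cannot use debug_repo without local debug flag")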
src/backend/envs.py CHANGED
@@ -57,7 +57,7 @@ class Tasks(Enum):
 
     # task20 = Task("race", "acc", "RACE", 0)
     task21 = Task("mmlu", "acc", "MMLU", 5)
-    task22 = Task("gsm8k", "exact_match", "GSM8K", 5)
+    task22 = Task("gsm8k", "em", "GSM8K", 5)
 
 
 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
src/display/utils.py CHANGED
@@ -75,6 +75,7 @@ class Tasks(Enum):
     # # XXX include me back at some point
     selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
+    gsm8k = Task("gsm8k", "em", "GSM8K") #GSM8K/EM (5-shot)
 
 
 # These classes are for user facing column names,
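
Note: leaderboard cells are looked up by the (benchmark, metric) pair, so this display entry must use the same metric key as the backend task; that is presumably why task22 in src/backend/envs.py was renamed from "exact_match" to "em" in the same commit. A hedged illustration of the coupling (the field names follow the Task constructor calls above; the results layout is an assumption, not taken from this repo):

gsm8k = Task("gsm8k", "em", "GSM8K")
results = {"gsm8k": {"em": 0.42}}               # assumed shape of a parsed result file
score = results[gsm8k.benchmark][gsm8k.metric]  # KeyError if the two keys diverge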
src/submission/check_validity.py CHANGED
@@ -130,7 +130,8 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
             continue
         with open(os.path.join(root, file), "r") as f:
             info = json.load(f)
-            file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
+            if not info["status"] == "FINISHED" and not info["status"] == "RUNNING":
+                file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
 
             # Select organisation
             if info["model"].count("/") == 0 or "submitted_time" not in info:
src/utils.py CHANGED
@@ -3,12 +3,48 @@ from huggingface_hub import snapshot_download
 import subprocess
 import re
 import os
+import GPUtil
 
 try:
     from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
 except:
     print("local debug: from display.utils")
     from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
+
+MEM_BW_DICT = {
+    "NVIDIA-A100-PCIe-80GB": 1935,
+    "NVIDIA-A100-SXM-80GB": 2039,
+    "NVIDIA-H100-PCIe-80GB": 2039,
+    "NVIDIA-RTX-A5000-24GB": 768
+}
+
+PEAK_FLOPS_DICT = {
+    "float32": {
+        "NVIDIA-A100-PCIe-80GB": 312e12,
+        "NVIDIA-A100-SXM-80GB": 312e12,
+        "NVIDIA-H100-PCIe-80GB": 756e12,
+        "NVIDIA-RTX-A5000-24GB": 222.2e12
+    },
+    "float16": {
+        "NVIDIA-A100-PCIe-80GB": 624e12,
+        "NVIDIA-A100-SXM-80GB": 624e12,
+        "NVIDIA-H100-PCIe-80GB": 1513e12,
+        "NVIDIA-RTX-A5000-24GB": 444.4e12
+    },
+    "8bit": {
+        "NVIDIA-A100-PCIe-80GB": 1248e12,
+        "NVIDIA-A100-SXM-80GB": 1248e12,
+        "NVIDIA-H100-PCIe-80GB": 3026e12,
+        "NVIDIA-RTX-A5000-24GB": 889e12
+    },
+    "4bit": {
+        "NVIDIA-A100-PCIe-80GB": 2496e12,
+        "NVIDIA-A100-SXM-80GB": 2496e12,
+        "NVIDIA-H100-PCIe-80GB": 6052e12,
+        "NVIDIA-RTX-A5000-24GB": 1778e12
+    }
+}
 
 def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
     for i in range(10):
@@ -56,7 +92,7 @@ def parse_nvidia_smi():
     gpu_stats = []
 
     gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
-    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+?\d+GB)')
+    gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
 
     gpu_name = ""
     for index in gpu_indices:
@@ -131,5 +167,70 @@ def analyze_gpu_stats(stats_list):
 
     return avg_stats
 
+def get_gpu_number():
+    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
+    if visible_devices is not None:
+        gpu_indices = visible_devices.split(',')
+    else:
+        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
+        result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
+        if result.returncode != 0:
+            print("Failed to query GPU indices.")
+            return []
+        gpu_indices = result.stdout.strip().split('\n')
+    # print(f"gpu_indices: {gpu_indices}")
+    gpu_stats = []
+
+    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
+
+    for index in gpu_indices:
+        result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
+        output = result.stdout.strip()
+        lines = output.split("\n")
+        for line in lines:
+            match = gpu_info_pattern.search(line)
+            gpu_info = {}
+            if match:
+                temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
+                gpu_info.update({
+                    GPU_TEMP: temp,
+                    GPU_Power: power_usage,
+                    GPU_Mem: round(mem_usage / 1024, 2),
+                    GPU_Util: gpu_util
+                })
+            if len(gpu_info) >= 4:
+                gpu_stats.append(gpu_info)
+
+    return len(gpu_stats)
+
+def get_gpu_details():
+    gpus = GPUtil.getGPUs()
+    gpu = gpus[0]
+    name = gpu.name.replace(" ", "-")
+    # Convert memory from MB to GB and round to nearest whole number
+    memory_gb = round(gpu.memoryTotal / 1024)
+    memory = f"{memory_gb}GB"
+    formatted_name = f"{name}-{memory}"
+    return formatted_name
+
+def get_peak_bw(gpu_name):
+    return MEM_BW_DICT[gpu_name]
+
+def get_peak_flops(gpu_name, precision):
+    return PEAK_FLOPS_DICT[precision][gpu_name]
+
+def transfer_precision2bytes(precision):
+    if precision == "float32":
+        return 4
+    elif precision == "float16":
+        return 2
+    elif precision == "8bit":
+        return 1
+    elif precision == "4bit":
+        return 0.5
+    else:
+        raise ValueError(f"Unsupported precision: {precision}")
+
 if __name__ == "__main__":
     print(analyze_gpu_stats(parse_nvidia_smi()))
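
Note: get_peak_bw returns memory bandwidth in GB/s and get_peak_flops returns peak FLOP/s per GPU type and precision, which suggests roofline-style throughput ceilings for the efficiency columns. A hedged usage sketch (the 7B-parameter model, the memory-bound decode formula, and the assumption that get_gpu_details yields a name matching the dict keys are all illustrative, not code from this repo):

from src.utils import get_gpu_details, get_peak_bw, get_peak_flops, transfer_precision2bytes

gpu = get_gpu_details()                   # assumed to match a key like "NVIDIA-A100-PCIe-80GB"
precision = "float16"
bytes_per_param = transfer_precision2bytes(precision)   # 2 bytes for float16

n_params = 7e9                            # assumed model size
weight_bytes = n_params * bytes_per_param
# Memory-bound decoding re-reads the weights once per generated token,
# so peak bandwidth caps tokens/s at roughly bandwidth / weight_bytes.
max_tokens_per_s = get_peak_bw(gpu) * 1e9 / weight_bytes
print(f"{max_tokens_per_s:.1f} tokens/s bandwidth ceiling,",
      f"{get_peak_flops(gpu, precision):.0e} FLOP/s compute ceiling")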