future-xy committed on
Commit
d6d7ec6
·
1 Parent(s): 2d754ab

formatting code

Browse files
app.py CHANGED
@@ -19,7 +19,7 @@ from src.display.about import (
19
  LLM_BENCHMARKS_TEXT,
20
  LLM_BENCHMARKS_DETAILS,
21
  FAQ_TEXT,
22
- TITLE
23
  )
24
 
25
  from src.display.css_html_js import custom_css
@@ -35,7 +35,7 @@ from src.display.utils import (
35
  ModelType,
36
  fields,
37
  WeightType,
38
- Precision
39
  )
40
 
41
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
@@ -47,7 +47,9 @@ from src.utils import get_dataset_summary_table
47
  def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
48
  try:
49
  print(local_dir)
50
- snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type=repo_type, tqdm_class=tqdm_class, etag_timeout=etag_timeout)
 
 
51
  except Exception as e:
52
  restart_space()
53
 
@@ -57,15 +59,21 @@ def restart_space():
57
 
58
 
59
  def init_space():
60
- dataset_df = get_dataset_summary_table(file_path='blog/Hallucination-Leaderboard-Summary.csv')
61
 
62
- if socket.gethostname() not in {'neuromancer'}:
63
  # sync model_type with open-llm-leaderboard
64
- ui_snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
65
- ui_snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 
 
 
 
66
  raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, "", COLS, BENCHMARK_COLS)
67
 
68
- finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
69
  return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
70
 
71
 
@@ -74,12 +82,9 @@ leaderboard_df = original_df.copy()
74
 
75
 
76
  # Searching and filtering
77
- def update_table(hidden_df: pd.DataFrame,
78
- columns: list,
79
- type_query: list,
80
- precision_query: list,
81
- size_query: list,
82
- query: str):
83
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
84
  filtered_df = filter_queries(query, filtered_df)
85
  df = select_columns(filtered_df, columns)
@@ -99,7 +104,9 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
99
  # We use COLS to maintain sorting
100
  filtered_df = df[
101
  # always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
102
- always_here_cols + [c for c in COLS if c in df.columns and c in columns] + dummy_col
 
 
103
  ]
104
  return filtered_df
105
 
@@ -121,10 +128,7 @@ def filter_queries(query: str, filtered_df: pd.DataFrame):
121
  return filtered_df
122
 
123
 
124
- def filter_models(df: pd.DataFrame,
125
- type_query: list,
126
- size_query: list,
127
- precision_query: list) -> pd.DataFrame:
128
  # Show all models
129
  filtered_df = df
130
 
@@ -152,15 +156,15 @@ with demo:
152
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
153
 
154
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
155
- with gr.TabItem("MOE-LLM-GPU-Poor-Leaderboard Benchmark",
156
- elem_id="llm-benchmark-tab-table",
157
- id=0):
158
  with gr.Row():
159
  with gr.Column():
160
  with gr.Row():
161
- search_bar = gr.Textbox(placeholder=" 🔍 Model search (separate multiple queries with `;`)",
162
- show_label=False,
163
- elem_id="search-bar")
 
 
164
  with gr.Row():
165
  shown_columns = gr.CheckboxGroup(
166
  choices=[
@@ -175,7 +179,8 @@ with demo:
175
  ],
176
  label="Select columns to show",
177
  elem_id="column-select",
178
- interactive=True)
 
179
 
180
  with gr.Column(min_width=320):
181
  filter_columns_type = gr.CheckboxGroup(
@@ -183,40 +188,51 @@ with demo:
183
  choices=[t.to_str() for t in ModelType],
184
  value=[t.to_str() for t in ModelType],
185
  interactive=True,
186
- elem_id="filter-columns-type")
 
187
 
188
  filter_columns_precision = gr.CheckboxGroup(
189
  label="Precision",
190
  choices=[i.value.name for i in Precision],
191
  value=[i.value.name for i in Precision],
192
  interactive=True,
193
- elem_id="filter-columns-precision")
 
194
 
195
  filter_columns_size = gr.CheckboxGroup(
196
  label="Model sizes (in billions of parameters)",
197
  choices=list(NUMERIC_INTERVALS.keys()),
198
  value=list(NUMERIC_INTERVALS.keys()),
199
  interactive=True,
200
- elem_id="filter-columns-size")
 
201
 
202
  # breakpoint()
203
 
204
  leaderboard_table = gr.components.Dataframe(
205
- value=leaderboard_df[
206
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + [AutoEvalColumn.dummy.name]
207
- ] if leaderboard_df.empty is False else leaderboard_df,
 
 
 
 
 
 
208
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
209
  datatype=TYPES,
210
  elem_id="leaderboard-table",
211
  interactive=False,
212
- visible=True) # column_widths=["2%", "20%"]
 
213
 
214
  # Dummy leaderboard for handling the case when the user uses backspace key
215
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
216
  value=original_df[COLS] if original_df.empty is False else original_df,
217
  headers=COLS,
218
  datatype=TYPES,
219
- visible=False)
 
220
 
221
  search_bar.submit(
222
  update_table,
@@ -228,7 +244,8 @@ with demo:
228
  filter_columns_size,
229
  search_bar,
230
  ],
231
- leaderboard_table)
 
232
 
233
  # Check query parameter once at startup and update search bar
234
  demo.load(load_query, inputs=[], outputs=[search_bar])
@@ -245,7 +262,8 @@ with demo:
245
  search_bar,
246
  ],
247
  leaderboard_table,
248
- queue=True)
 
249
 
250
  with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
251
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
@@ -253,11 +271,12 @@ with demo:
253
  dataset_table = gr.components.Dataframe(
254
  value=dataset_df,
255
  headers=list(dataset_df.columns),
256
- datatype=['str', 'markdown', 'str', 'str', 'str'],
257
  elem_id="dataset-table",
258
  interactive=False,
259
  visible=True,
260
- column_widths=["15%", "20%"])
 
261
 
262
  gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
263
  gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
@@ -271,26 +290,20 @@ with demo:
271
  with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
272
  with gr.Row():
273
  finished_eval_table = gr.components.Dataframe(
274
- value=finished_eval_queue_df,
275
- headers=EVAL_COLS,
276
- datatype=EVAL_TYPES,
277
- row_count=5)
278
 
279
  with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
280
  with gr.Row():
281
  running_eval_table = gr.components.Dataframe(
282
- value=running_eval_queue_df,
283
- headers=EVAL_COLS,
284
- datatype=EVAL_TYPES,
285
- row_count=5)
286
 
287
  with gr.Accordion(f"⏳ Scheduled Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
288
  with gr.Row():
289
  pending_eval_table = gr.components.Dataframe(
290
- value=pending_eval_queue_df,
291
- headers=EVAL_COLS,
292
- datatype=EVAL_TYPES,
293
- row_count=5)
294
 
295
  with gr.Row():
296
  gr.Markdown("# Submit your model here", elem_classes="markdown-text")
@@ -305,7 +318,8 @@ with demo:
305
  label="Model type",
306
  multiselect=False,
307
  value=None,
308
- interactive=True)
 
309
 
310
  with gr.Column():
311
  precision = gr.Dropdown(
@@ -313,14 +327,16 @@ with demo:
313
  label="Precision",
314
  multiselect=False,
315
  value="float32",
316
- interactive=True)
 
317
 
318
  weight_type = gr.Dropdown(
319
  choices=[i.value.name for i in WeightType],
320
  label="Weights type",
321
  multiselect=False,
322
  value="Original",
323
- interactive=True)
 
324
 
325
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
326
 
@@ -337,7 +353,8 @@ with demo:
337
  weight_type,
338
  model_type,
339
  ],
340
- submission_result)
 
341
 
342
  with gr.Row():
343
  with gr.Accordion("Citing this leaderboard", open=False):
@@ -346,7 +363,8 @@ with demo:
346
  label=CITATION_BUTTON_LABEL,
347
  lines=20,
348
  elem_id="citation-button",
349
- show_copy_button=True)
 
350
 
351
  scheduler = BackgroundScheduler()
352
 
@@ -356,7 +374,8 @@ scheduler.add_job(restart_space, "interval", seconds=6 * 60 * 60)
356
  def launch_backend():
357
  import subprocess
358
  from src.backend.envs import DEVICE
359
- if DEVICE not in {'cpu'}:
 
360
  _ = subprocess.run(["python", "backend-cli.py"])
361
 
362
 
 
19
  LLM_BENCHMARKS_TEXT,
20
  LLM_BENCHMARKS_DETAILS,
21
  FAQ_TEXT,
22
+ TITLE,
23
  )
24
 
25
  from src.display.css_html_js import custom_css
 
35
  ModelType,
36
  fields,
37
  WeightType,
38
+ Precision,
39
  )
40
 
41
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
 
47
  def ui_snapshot_download(repo_id, local_dir, repo_type, tqdm_class, etag_timeout):
48
  try:
49
  print(local_dir)
50
+ snapshot_download(
51
+ repo_id=repo_id, local_dir=local_dir, repo_type=repo_type, tqdm_class=tqdm_class, etag_timeout=etag_timeout
52
+ )
53
  except Exception as e:
54
  restart_space()
55
 
 
59
 
60
 
61
  def init_space():
62
+ dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
63
 
64
+ if socket.gethostname() not in {"neuromancer"}:
65
  # sync model_type with open-llm-leaderboard
66
+ ui_snapshot_download(
67
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
68
+ )
69
+ ui_snapshot_download(
70
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
71
+ )
72
  raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, "", COLS, BENCHMARK_COLS)
73
 
74
+ finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
75
+ EVAL_REQUESTS_PATH, EVAL_COLS
76
+ )
77
  return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
78
 
79
 
 
82
 
83
 
84
  # Searching and filtering
85
+ def update_table(
86
+ hidden_df: pd.DataFrame, columns: list, type_query: list, precision_query: list, size_query: list, query: str
87
+ ):
 
 
 
88
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
89
  filtered_df = filter_queries(query, filtered_df)
90
  df = select_columns(filtered_df, columns)
 
104
  # We use COLS to maintain sorting
105
  filtered_df = df[
106
  # always_here_cols + [c for c in COLS if c in df.columns and c in columns] + [AutoEvalColumn.dummy.name]
107
+ always_here_cols
108
+ + [c for c in COLS if c in df.columns and c in columns]
109
+ + dummy_col
110
  ]
111
  return filtered_df
112
 
 
128
  return filtered_df
129
 
130
 
131
+ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precision_query: list) -> pd.DataFrame:
 
 
 
132
  # Show all models
133
  filtered_df = df
134
 
 
156
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
157
 
158
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
159
+ with gr.TabItem("MOE-LLM-GPU-Poor-Leaderboard Benchmark", elem_id="llm-benchmark-tab-table", id=0):
 
 
160
  with gr.Row():
161
  with gr.Column():
162
  with gr.Row():
163
+ search_bar = gr.Textbox(
164
+ placeholder=" 🔍 Model search (separate multiple queries with `;`)",
165
+ show_label=False,
166
+ elem_id="search-bar",
167
+ )
168
  with gr.Row():
169
  shown_columns = gr.CheckboxGroup(
170
  choices=[
 
179
  ],
180
  label="Select columns to show",
181
  elem_id="column-select",
182
+ interactive=True,
183
+ )
184
 
185
  with gr.Column(min_width=320):
186
  filter_columns_type = gr.CheckboxGroup(
 
188
  choices=[t.to_str() for t in ModelType],
189
  value=[t.to_str() for t in ModelType],
190
  interactive=True,
191
+ elem_id="filter-columns-type",
192
+ )
193
 
194
  filter_columns_precision = gr.CheckboxGroup(
195
  label="Precision",
196
  choices=[i.value.name for i in Precision],
197
  value=[i.value.name for i in Precision],
198
  interactive=True,
199
+ elem_id="filter-columns-precision",
200
+ )
201
 
202
  filter_columns_size = gr.CheckboxGroup(
203
  label="Model sizes (in billions of parameters)",
204
  choices=list(NUMERIC_INTERVALS.keys()),
205
  value=list(NUMERIC_INTERVALS.keys()),
206
  interactive=True,
207
+ elem_id="filter-columns-size",
208
+ )
209
 
210
  # breakpoint()
211
 
212
  leaderboard_table = gr.components.Dataframe(
213
+ value=(
214
+ leaderboard_df[
215
+ [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
216
+ + shown_columns.value
217
+ + [AutoEvalColumn.dummy.name]
218
+ ]
219
+ if leaderboard_df.empty is False
220
+ else leaderboard_df
221
+ ),
222
  headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
223
  datatype=TYPES,
224
  elem_id="leaderboard-table",
225
  interactive=False,
226
+ visible=True,
227
+ ) # column_widths=["2%", "20%"]
228
 
229
  # Dummy leaderboard for handling the case when the user uses backspace key
230
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
231
  value=original_df[COLS] if original_df.empty is False else original_df,
232
  headers=COLS,
233
  datatype=TYPES,
234
+ visible=False,
235
+ )
236
 
237
  search_bar.submit(
238
  update_table,
 
244
  filter_columns_size,
245
  search_bar,
246
  ],
247
+ leaderboard_table,
248
+ )
249
 
250
  # Check query parameter once at startup and update search bar
251
  demo.load(load_query, inputs=[], outputs=[search_bar])
 
262
  search_bar,
263
  ],
264
  leaderboard_table,
265
+ queue=True,
266
+ )
267
 
268
  with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
269
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
271
  dataset_table = gr.components.Dataframe(
272
  value=dataset_df,
273
  headers=list(dataset_df.columns),
274
+ datatype=["str", "markdown", "str", "str", "str"],
275
  elem_id="dataset-table",
276
  interactive=False,
277
  visible=True,
278
+ column_widths=["15%", "20%"],
279
+ )
280
 
281
  gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
282
  gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
 
290
  with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
291
  with gr.Row():
292
  finished_eval_table = gr.components.Dataframe(
293
+ value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
294
+ )
 
 
295
 
296
  with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
297
  with gr.Row():
298
  running_eval_table = gr.components.Dataframe(
299
+ value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
300
+ )
 
 
301
 
302
  with gr.Accordion(f"⏳ Scheduled Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
303
  with gr.Row():
304
  pending_eval_table = gr.components.Dataframe(
305
+ value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
306
+ )
 
 
307
 
308
  with gr.Row():
309
  gr.Markdown("# Submit your model here", elem_classes="markdown-text")
 
318
  label="Model type",
319
  multiselect=False,
320
  value=None,
321
+ interactive=True,
322
+ )
323
 
324
  with gr.Column():
325
  precision = gr.Dropdown(
 
327
  label="Precision",
328
  multiselect=False,
329
  value="float32",
330
+ interactive=True,
331
+ )
332
 
333
  weight_type = gr.Dropdown(
334
  choices=[i.value.name for i in WeightType],
335
  label="Weights type",
336
  multiselect=False,
337
  value="Original",
338
+ interactive=True,
339
+ )
340
 
341
  base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
342
 
 
353
  weight_type,
354
  model_type,
355
  ],
356
+ submission_result,
357
+ )
358
 
359
  with gr.Row():
360
  with gr.Accordion("Citing this leaderboard", open=False):
 
363
  label=CITATION_BUTTON_LABEL,
364
  lines=20,
365
  elem_id="citation-button",
366
+ show_copy_button=True,
367
+ )
368
 
369
  scheduler = BackgroundScheduler()
370
 
 
374
  def launch_backend():
375
  import subprocess
376
  from src.backend.envs import DEVICE
377
+
378
+ if DEVICE not in {"cpu"}:
379
  _ = subprocess.run(["python", "backend-cli.py"])
380
 
381
 
backend-cli.py CHANGED
@@ -32,7 +32,9 @@ import pprint
32
  def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
33
  for i in range(10):
34
  try:
35
- set_eval_request(api=api, eval_request=eval_request, set_to_status=set_to_status, hf_repo=hf_repo, local_dir=local_dir)
 
 
36
  return
37
  except Exception as e:
38
  print(f"Error setting eval request to {set_to_status}: {e}. Retrying in 60 seconds")
@@ -53,19 +55,32 @@ FAILED_STATUS = "FAILED"
53
  TASKS_HARNESS = [task.value for task in Tasks]
54
 
55
 
56
- my_snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
57
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
 
 
 
58
 
59
 
60
  def sanity_checks():
61
- print(f'Device: {DEVICE}')
62
 
63
  # pull the eval dataset from the hub and parse any eval requests
64
  # check completed evals and set them to finished
65
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
66
- check_completed_evals(api=API, checked_status=RUNNING_STATUS, completed_status=FINISHED_STATUS,
67
- failed_status=FAILED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND,
68
- hf_repo_results=RESULTS_REPO, local_dir_results=EVAL_RESULTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
 
69
  return
70
 
71
 
@@ -97,29 +112,51 @@ def request_to_result_name(request: EvalRequest) -> str:
97
  def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
98
  batch_size = 2
99
  try:
100
- results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
101
- batch_size=batch_size, device=DEVICE, use_cache=None, limit=LIMIT)
 
 
 
 
 
 
 
102
  except RuntimeError as e:
103
  if "No executable batch size found" in str(e):
104
  batch_size = 1
105
- results = run_evaluation(eval_request=eval_request, task_names=[task.benchmark], num_fewshot=task.num_fewshot,
106
- batch_size=batch_size, device=DEVICE, use_cache=None, limit=LIMIT)
 
 
 
 
 
 
 
107
  else:
108
  raise
109
 
110
- print('RESULTS', results)
111
 
112
- dumped = json.dumps(results, indent=2, default=lambda o: '<not serializable>')
113
  print(dumped)
114
 
115
- output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
 
 
116
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
117
  with open(output_path, "w") as f:
118
  f.write(dumped)
119
 
120
- my_snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
121
- API.upload_file(path_or_fileobj=output_path, path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
122
- repo_id=RESULTS_REPO, repo_type="dataset")
 
 
 
 
 
 
123
  return results
124
 
125
 
@@ -129,7 +166,9 @@ def process_finished_requests(thr: int, hard_task_lst: Optional[list[str]] = Non
129
  current_finished_status = [FINISHED_STATUS, FAILED_STATUS]
130
 
131
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
132
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
133
  # Sort the evals by priority (first submitted, first run)
134
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
135
 
@@ -145,7 +184,9 @@ def process_finished_requests(thr: int, hard_task_lst: Optional[list[str]] = Non
145
  result_name: str = request_to_result_name(eval_request)
146
 
147
  # Check the corresponding result
148
- eval_result: Optional[EvalResult] = result_name_to_result[result_name] if result_name in result_name_to_result else None
 
 
149
 
150
  # breakpoint()
151
 
@@ -163,13 +204,37 @@ def process_finished_requests(thr: int, hard_task_lst: Optional[list[str]] = Non
163
  if (eval_result is None or task_name not in eval_result.results) and do_run_task:
164
  eval_request: EvalRequest = result_name_to_request[result_name]
165
 
166
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
167
- my_set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
  results = process_evaluation(task, eval_request)
170
 
171
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
172
- my_set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
 
 
 
 
173
 
174
  return True
175
 
@@ -182,7 +247,9 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
182
  current_finished_status = [PENDING_STATUS, FINISHED_STATUS, FAILED_STATUS]
183
 
184
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
185
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
186
  # Sort the evals by priority (first submitted, first run)
187
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
188
 
@@ -198,7 +265,9 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
198
  result_name: str = request_to_result_name(eval_request)
199
 
200
  # Check the corresponding result
201
- eval_result: Optional[EvalResult] = result_name_to_result[result_name] if result_name in result_name_to_result else None
 
 
202
 
203
  task_lst = TASKS_HARNESS.copy()
204
  random.shuffle(task_lst)
@@ -211,18 +280,46 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
211
  if hard_task_lst is None or any(ss in task_name for ss in hard_task_lst):
212
  do_run_task = True
213
 
214
- task_lst = ['nq', 'trivia', 'tqa', 'self']
215
- if (eval_result is None or do_run_task or task_name not in eval_result.results or
216
- any(ss in task_name for ss in task_lst)):
 
 
 
 
217
  eval_request: EvalRequest = result_name_to_request[result_name]
218
 
219
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
220
- my_set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
  results = process_evaluation(task, eval_request)
223
 
224
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
225
- my_set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
  return True
228
 
@@ -235,7 +332,9 @@ def process_pending_requests() -> bool:
235
  current_pending_status = [PENDING_STATUS]
236
 
237
  # Get all eval request that are PENDING, if you want to run other evals, change this parameter
238
- eval_requests = get_eval_requests(job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
239
  # Sort the evals by priority (first submitted, first run)
240
  eval_requests = sort_models_by_priority(api=API, models=eval_requests)
241
 
@@ -249,8 +348,16 @@ def process_pending_requests() -> bool:
249
  eval_request = eval_requests[0]
250
  pp.pprint(eval_request)
251
 
252
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
253
- my_set_eval_request(api=API, eval_request=eval_request, set_to_status=RUNNING_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
254
 
255
  task_lst = TASKS_HARNESS.copy()
256
  random.shuffle(task_lst)
@@ -258,34 +365,44 @@ def process_pending_requests() -> bool:
258
  for task in task_lst:
259
  results = process_evaluation(task, eval_request)
260
 
261
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
262
- my_set_eval_request(api=API, eval_request=eval_request, set_to_status=FINISHED_STATUS, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
 
 
 
 
 
 
263
 
264
  return True
265
 
266
 
267
  def get_args():
268
- parser = argparse.ArgumentParser(description='Run the backend')
269
- parser.add_argument('--debug', action='store_true', help='Run in debug mode')
270
  return parser.parse_args()
271
 
272
 
273
  if __name__ == "__main__":
274
  args = get_args()
275
  local_debug = args.debug
276
- #debug specific task by ping
277
  if local_debug:
278
- debug_model_names = ['mistralai/Mixtral-8x7B-Instruct-v0.1']
279
  # debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
280
  # debug_task_name = 'ifeval'
281
- debug_task_name = 'mmlu'
282
  task_lst = TASKS_HARNESS.copy()
283
  for task in task_lst:
284
  for debug_model_name in debug_model_names:
285
  task_name = task.benchmark
286
  if task_name != debug_task_name:
287
  continue
288
- eval_request = EvalRequest(model=debug_model_name, private=False, status='', json_filepath='', precision='float16')
 
 
289
  results = process_evaluation(task, eval_request)
290
 
291
  while True:
 
32
  def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
33
  for i in range(10):
34
  try:
35
+ set_eval_request(
36
+ api=api, eval_request=eval_request, set_to_status=set_to_status, hf_repo=hf_repo, local_dir=local_dir
37
+ )
38
  return
39
  except Exception as e:
40
  print(f"Error setting eval request to {set_to_status}: {e}. Retrying in 60 seconds")
 
55
  TASKS_HARNESS = [task.value for task in Tasks]
56
 
57
 
58
+ my_snapshot_download(
59
+ repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
60
+ )
61
+ my_snapshot_download(
62
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
63
+ )
64
 
65
 
66
  def sanity_checks():
67
+ print(f"Device: {DEVICE}")
68
 
69
  # pull the eval dataset from the hub and parse any eval requests
70
  # check completed evals and set them to finished
71
+ my_snapshot_download(
72
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
73
+ )
74
+ check_completed_evals(
75
+ api=API,
76
+ checked_status=RUNNING_STATUS,
77
+ completed_status=FINISHED_STATUS,
78
+ failed_status=FAILED_STATUS,
79
+ hf_repo=QUEUE_REPO,
80
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
81
+ hf_repo_results=RESULTS_REPO,
82
+ local_dir_results=EVAL_RESULTS_PATH_BACKEND,
83
+ )
84
  return
85
 
86
 
 
112
  def process_evaluation(task: Task, eval_request: EvalRequest) -> dict:
113
  batch_size = 2
114
  try:
115
+ results = run_evaluation(
116
+ eval_request=eval_request,
117
+ task_names=[task.benchmark],
118
+ num_fewshot=task.num_fewshot,
119
+ batch_size=batch_size,
120
+ device=DEVICE,
121
+ use_cache=None,
122
+ limit=LIMIT,
123
+ )
124
  except RuntimeError as e:
125
  if "No executable batch size found" in str(e):
126
  batch_size = 1
127
+ results = run_evaluation(
128
+ eval_request=eval_request,
129
+ task_names=[task.benchmark],
130
+ num_fewshot=task.num_fewshot,
131
+ batch_size=batch_size,
132
+ device=DEVICE,
133
+ use_cache=None,
134
+ limit=LIMIT,
135
+ )
136
  else:
137
  raise
138
 
139
+ print("RESULTS", results)
140
 
141
+ dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
142
  print(dumped)
143
 
144
+ output_path = os.path.join(
145
+ EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json"
146
+ )
147
  os.makedirs(os.path.dirname(output_path), exist_ok=True)
148
  with open(output_path, "w") as f:
149
  f.write(dumped)
150
 
151
+ my_snapshot_download(
152
+ repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
153
+ )
154
+ API.upload_file(
155
+ path_or_fileobj=output_path,
156
+ path_in_repo=f"{eval_request.model}/results_{datetime.now()}.json",
157
+ repo_id=RESULTS_REPO,
158
+ repo_type="dataset",
159
+ )
160
  return results
161
 
162
 
 
166
  current_finished_status = [FINISHED_STATUS, FAILED_STATUS]
167
 
168
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
169
+ eval_requests: list[EvalRequest] = get_eval_requests(
170
+ job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
171
+ )
172
  # Sort the evals by priority (first submitted, first run)
173
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
174
 
 
184
  result_name: str = request_to_result_name(eval_request)
185
 
186
  # Check the corresponding result
187
+ eval_result: Optional[EvalResult] = (
188
+ result_name_to_result[result_name] if result_name in result_name_to_result else None
189
+ )
190
 
191
  # breakpoint()
192
 
 
204
  if (eval_result is None or task_name not in eval_result.results) and do_run_task:
205
  eval_request: EvalRequest = result_name_to_request[result_name]
206
 
207
+ my_snapshot_download(
208
+ repo_id=QUEUE_REPO,
209
+ revision="main",
210
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
211
+ repo_type="dataset",
212
+ max_workers=60,
213
+ )
214
+ my_set_eval_request(
215
+ api=API,
216
+ eval_request=eval_request,
217
+ set_to_status=RUNNING_STATUS,
218
+ hf_repo=QUEUE_REPO,
219
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
220
+ )
221
 
222
  results = process_evaluation(task, eval_request)
223
 
224
+ my_snapshot_download(
225
+ repo_id=QUEUE_REPO,
226
+ revision="main",
227
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
228
+ repo_type="dataset",
229
+ max_workers=60,
230
+ )
231
+ my_set_eval_request(
232
+ api=API,
233
+ eval_request=eval_request,
234
+ set_to_status=FINISHED_STATUS,
235
+ hf_repo=QUEUE_REPO,
236
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
237
+ )
238
 
239
  return True
240
 
 
247
  current_finished_status = [PENDING_STATUS, FINISHED_STATUS, FAILED_STATUS]
248
 
249
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
250
+ eval_requests: list[EvalRequest] = get_eval_requests(
251
+ job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
252
+ )
253
  # Sort the evals by priority (first submitted, first run)
254
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
255
 
 
265
  result_name: str = request_to_result_name(eval_request)
266
 
267
  # Check the corresponding result
268
+ eval_result: Optional[EvalResult] = (
269
+ result_name_to_result[result_name] if result_name in result_name_to_result else None
270
+ )
271
 
272
  task_lst = TASKS_HARNESS.copy()
273
  random.shuffle(task_lst)
 
280
  if hard_task_lst is None or any(ss in task_name for ss in hard_task_lst):
281
  do_run_task = True
282
 
283
+ task_lst = ["nq", "trivia", "tqa", "self"]
284
+ if (
285
+ eval_result is None
286
+ or do_run_task
287
+ or task_name not in eval_result.results
288
+ or any(ss in task_name for ss in task_lst)
289
+ ):
290
  eval_request: EvalRequest = result_name_to_request[result_name]
291
 
292
+ my_snapshot_download(
293
+ repo_id=QUEUE_REPO,
294
+ revision="main",
295
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
296
+ repo_type="dataset",
297
+ max_workers=60,
298
+ )
299
+ my_set_eval_request(
300
+ api=API,
301
+ eval_request=eval_request,
302
+ set_to_status=RUNNING_STATUS,
303
+ hf_repo=QUEUE_REPO,
304
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
305
+ )
306
 
307
  results = process_evaluation(task, eval_request)
308
 
309
+ my_snapshot_download(
310
+ repo_id=QUEUE_REPO,
311
+ revision="main",
312
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
313
+ repo_type="dataset",
314
+ max_workers=60,
315
+ )
316
+ my_set_eval_request(
317
+ api=API,
318
+ eval_request=eval_request,
319
+ set_to_status=FINISHED_STATUS,
320
+ hf_repo=QUEUE_REPO,
321
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
322
+ )
323
 
324
  return True
325
 
 
332
  current_pending_status = [PENDING_STATUS]
333
 
334
  # Get all eval request that are PENDING, if you want to run other evals, change this parameter
335
+ eval_requests = get_eval_requests(
336
+ job_status=current_pending_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
337
+ )
338
  # Sort the evals by priority (first submitted, first run)
339
  eval_requests = sort_models_by_priority(api=API, models=eval_requests)
340
 
 
348
  eval_request = eval_requests[0]
349
  pp.pprint(eval_request)
350
 
351
+ my_snapshot_download(
352
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
353
+ )
354
+ my_set_eval_request(
355
+ api=API,
356
+ eval_request=eval_request,
357
+ set_to_status=RUNNING_STATUS,
358
+ hf_repo=QUEUE_REPO,
359
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
360
+ )
361
 
362
  task_lst = TASKS_HARNESS.copy()
363
  random.shuffle(task_lst)
 
365
  for task in task_lst:
366
  results = process_evaluation(task, eval_request)
367
 
368
+ my_snapshot_download(
369
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
370
+ )
371
+ my_set_eval_request(
372
+ api=API,
373
+ eval_request=eval_request,
374
+ set_to_status=FINISHED_STATUS,
375
+ hf_repo=QUEUE_REPO,
376
+ local_dir=EVAL_REQUESTS_PATH_BACKEND,
377
+ )
378
 
379
  return True
380
 
381
 
382
  def get_args():
383
+ parser = argparse.ArgumentParser(description="Run the backend")
384
+ parser.add_argument("--debug", action="store_true", help="Run in debug mode")
385
  return parser.parse_args()
386
 
387
 
388
  if __name__ == "__main__":
389
  args = get_args()
390
  local_debug = args.debug
391
+ # debug specific task by ping
392
  if local_debug:
393
+ debug_model_names = ["mistralai/Mixtral-8x7B-Instruct-v0.1"]
394
  # debug_model_names = ["TheBloke/Mixtral-8x7B-v0.1-GPTQ"]
395
  # debug_task_name = 'ifeval'
396
+ debug_task_name = "mmlu"
397
  task_lst = TASKS_HARNESS.copy()
398
  for task in task_lst:
399
  for debug_model_name in debug_model_names:
400
  task_name = task.benchmark
401
  if task_name != debug_task_name:
402
  continue
403
+ eval_request = EvalRequest(
404
+ model=debug_model_name, private=False, status="", json_filepath="", precision="float16"
405
+ )
406
  results = process_evaluation(task, eval_request)
407
 
408
  while True:
cli/analysis-cli.py CHANGED
@@ -77,19 +77,19 @@ def sanitise_dataset(name: str) -> str:
77
  return res
78
 
79
 
80
- cache_file = 'data_map_cache.pkl'
81
 
82
 
83
  def load_data_map_from_cache(cache_file):
84
  if os.path.exists(cache_file):
85
- with open(cache_file, 'rb') as f:
86
  return pickle.load(f)
87
  else:
88
  return None
89
 
90
 
91
  def save_data_map_to_cache(data_map, cache_file):
92
- with open(cache_file, 'wb') as f:
93
  pickle.dump(data_map, f)
94
 
95
 
@@ -98,8 +98,12 @@ data_map = load_data_map_from_cache(cache_file)
98
 
99
 
100
  if data_map is None:
101
- my_snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
102
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
 
 
 
103
 
104
  result_path_lst = find_json_files(EVAL_RESULTS_PATH_BACKEND)
105
  request_path_lst = find_json_files(EVAL_REQUESTS_PATH_BACKEND)
@@ -107,7 +111,7 @@ if data_map is None:
107
  model_name_to_model_map = {}
108
 
109
  for path in request_path_lst:
110
- with open(path, 'r') as f:
111
  data = json.load(f)
112
  model_name_to_model_map[data["model"]] = data
113
 
@@ -117,7 +121,7 @@ if data_map is None:
117
  data_map = {}
118
 
119
  for path in result_path_lst:
120
- with open(path, 'r') as f:
121
  data = json.load(f)
122
  model_name = data["config"]["model_name"]
123
  for dataset_name, results_dict in data["results"].items():
@@ -127,42 +131,42 @@ if data_map is None:
127
 
128
  to_add = True
129
 
130
- if 'f1' in metric_name:
131
  to_add = False
132
 
133
- if 'stderr' in metric_name:
134
  to_add = False
135
 
136
- if 'memo-trap_v2' in dataset_name:
137
  to_add = False
138
 
139
- if 'faithdial' in dataset_name:
140
  to_add = False
141
 
142
- if 'truthfulqa_gen' in dataset_name:
143
  to_add = False
144
 
145
- if 'bertscore' in metric_name:
146
- if 'precision' not in metric_name:
147
  to_add = False
148
 
149
- if 'halueval' in dataset_name:
150
- if 'acc' not in metric_name:
151
  to_add = False
152
 
153
- if 'ifeval' in dataset_name:
154
- if 'prompt_level_strict_acc' not in metric_name:
155
  to_add = False
156
 
157
- if 'squad' in dataset_name:
158
  # to_add = False
159
- if 'best_exact' in metric_name:
160
  to_add = False
161
 
162
- if 'fever' in dataset_name:
163
  to_add = False
164
 
165
- if ('xsum' in dataset_name or 'cnn' in dataset_name) and 'v2' not in dataset_name:
166
  to_add = False
167
 
168
  if isinstance(value, str):
@@ -172,25 +176,36 @@ if data_map is None:
172
  to_add = False
173
 
174
  if to_add:
175
- if 'rouge' in metric_name:
176
  value /= 100.0
177
 
178
- if 'squad' in dataset_name:
179
  value /= 100.0
180
 
181
  sanitised_metric_name = metric_name
182
  if "," in sanitised_metric_name:
183
- sanitised_metric_name = sanitised_metric_name.split(',')[0]
184
  sanitised_metric_name = sanitise_metric(sanitised_metric_name)
185
  sanitised_dataset_name = sanitise_dataset(dataset_name)
186
 
187
- model_dataset_metric_to_result_map[(model_name, sanitised_dataset_name, sanitised_metric_name)] = value
 
 
188
 
189
  if model_name not in data_map:
190
  data_map[model_name] = {}
191
  data_map[model_name][(sanitised_dataset_name, sanitised_metric_name)] = value
192
 
193
- print('model_name', model_name, 'dataset_name', sanitised_dataset_name, 'metric_name', sanitised_metric_name, 'value', value)
 
 
 
 
 
 
 
 
 
194
 
195
  save_data_map_to_cache(data_map, cache_file)
196
 
@@ -202,7 +217,7 @@ for model_name in model_name_lst:
202
  if len(data_map[model_name]) < nb_max_metrics - 5:
203
  del data_map[model_name]
204
 
205
- plot_type_lst = ['all', 'summ', 'qa', 'instr', 'detect', 'rc']
206
 
207
  for plot_type in plot_type_lst:
208
 
@@ -212,39 +227,39 @@ for plot_type in plot_type_lst:
212
  if dataset_metric not in data_map_v2:
213
  data_map_v2[dataset_metric] = {}
214
 
215
- if plot_type in {'all'}:
216
  to_add = True
217
- if 'ROUGE' in dataset_metric[1] and 'ROUGE-L' not in dataset_metric[1]:
218
  to_add = False
219
- if 'SQuAD' in dataset_metric[0] and 'EM' not in dataset_metric[1]:
220
  to_add = False
221
- if 'SelfCheckGPT' in dataset_metric[0] and 'MAX' not in dataset_metric[1]:
222
  to_add = False
223
- if '64-shot' in dataset_metric[0]:
224
  to_add = False
225
  if to_add is True:
226
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
227
- elif plot_type in {'summ'}:
228
- if 'CNN' in dataset_metric[0] or 'XSum' in dataset_metric[0]:
229
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
230
- elif plot_type in {'qa'}:
231
- if 'TriviaQA' in dataset_metric[0] or 'NQ' in dataset_metric[0] or 'TruthfulQA' in dataset_metric[0]:
232
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
233
- elif plot_type in {'instr'}:
234
- if 'MemoTrap' in dataset_metric[0] or 'IFEval' in dataset_metric[0]:
235
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
236
- elif plot_type in {'detect'}:
237
- if 'HaluEval' in dataset_metric[0] or 'SelfCheck' in dataset_metric[0]:
238
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
239
- elif plot_type in {'rc'}:
240
- if 'RACE' in dataset_metric[0] or 'SQuAD' in dataset_metric[0]:
241
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
242
  else:
243
  assert False, f"Unknown plot type: {plot_type}"
244
 
245
  # df = pd.DataFrame.from_dict(data_map, orient='index') # Invert the y-axis (rows)
246
- df = pd.DataFrame.from_dict(data_map_v2, orient='index') # Invert the y-axis (rows)
247
- df.index = [', '.join(map(str, idx)) for idx in df.index]
248
 
249
  o_df = df.copy(deep=True)
250
 
@@ -263,7 +278,7 @@ for plot_type in plot_type_lst:
263
 
264
  # Calculate dimensions based on the DataFrame size
265
  cell_height = 1.0 # Height of each cell in inches
266
- cell_width = 1.0 # Width of each cell in inches
267
 
268
  n_rows = len(df.index) # Datasets and Metrics
269
  n_cols = len(df.columns) # Models
@@ -277,60 +292,62 @@ for plot_type in plot_type_lst:
277
 
278
  sns.set_context("notebook", font_scale=1.3)
279
 
280
- dendrogram_ratio = (.1, .1)
281
 
282
- if plot_type in {'detect'}:
283
  fig_width = cell_width * n_cols - 2
284
  fig_height = cell_height * n_rows + 5.2
285
- dendrogram_ratio = (.1, .2)
286
 
287
- if plot_type in {'instr'}:
288
  fig_width = cell_width * n_cols - 2
289
  fig_height = cell_height * n_rows + 5.2
290
- dendrogram_ratio = (.1, .4)
291
 
292
- if plot_type in {'qa'}:
293
  fig_width = cell_width * n_cols - 2
294
  fig_height = cell_height * n_rows + 4
295
- dendrogram_ratio = (.1, .2)
296
 
297
- if plot_type in {'summ'}:
298
  fig_width = cell_width * n_cols - 2
299
  fig_height = cell_height * n_rows + 2.0
300
- dendrogram_ratio = (.1, .1)
301
  row_cluster = False
302
 
303
- if plot_type in {'rc'}:
304
  fig_width = cell_width * n_cols - 2
305
  fig_height = cell_height * n_rows + 5.2
306
- dendrogram_ratio = (.1, .4)
307
 
308
- print('figsize', (fig_width, fig_height))
309
 
310
- o_df.to_json(f'plots/clustermap_{plot_type}.json', orient='split')
311
 
312
- print(f'Generating the clustermaps for {plot_type}')
313
 
314
- for cmap in [None, 'coolwarm', 'viridis']:
315
- fig = sns.clustermap(df,
316
- method='ward',
317
- metric='euclidean',
318
- cmap=cmap,
319
- figsize=(fig_width, fig_height), # figsize=(24, 16),
320
- annot=True,
321
- mask=o_df.isnull(),
322
- dendrogram_ratio=dendrogram_ratio,
323
- fmt='.2f',
324
- col_cluster=col_cluster,
325
- row_cluster=row_cluster)
 
 
326
 
327
  # Adjust the size of the cells (less wide)
328
  plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
329
  plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
330
 
331
- cmap_suffix = '' if cmap is None else f'_{cmap}'
332
 
333
  # Save the clustermap to file
334
- fig.savefig(f'blog/figures/clustermap_{plot_type}{cmap_suffix}.pdf')
335
- fig.savefig(f'blog/figures/clustermap_{plot_type}{cmap_suffix}.png')
336
- fig.savefig(f'blog/figures/clustermap_{plot_type}{cmap_suffix}_t.png', transparent=True, facecolor="none")
 
77
  return res
78
 
79
 
80
+ cache_file = "data_map_cache.pkl"
81
 
82
 
83
  def load_data_map_from_cache(cache_file):
84
  if os.path.exists(cache_file):
85
+ with open(cache_file, "rb") as f:
86
  return pickle.load(f)
87
  else:
88
  return None
89
 
90
 
91
  def save_data_map_to_cache(data_map, cache_file):
92
+ with open(cache_file, "wb") as f:
93
  pickle.dump(data_map, f)
94
 
95
 
 
98
 
99
 
100
  if data_map is None:
101
+ my_snapshot_download(
102
+ repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
103
+ )
104
+ my_snapshot_download(
105
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
106
+ )
107
 
108
  result_path_lst = find_json_files(EVAL_RESULTS_PATH_BACKEND)
109
  request_path_lst = find_json_files(EVAL_REQUESTS_PATH_BACKEND)
 
111
  model_name_to_model_map = {}
112
 
113
  for path in request_path_lst:
114
+ with open(path, "r") as f:
115
  data = json.load(f)
116
  model_name_to_model_map[data["model"]] = data
117
 
 
121
  data_map = {}
122
 
123
  for path in result_path_lst:
124
+ with open(path, "r") as f:
125
  data = json.load(f)
126
  model_name = data["config"]["model_name"]
127
  for dataset_name, results_dict in data["results"].items():
 
131
 
132
  to_add = True
133
 
134
+ if "f1" in metric_name:
135
  to_add = False
136
 
137
+ if "stderr" in metric_name:
138
  to_add = False
139
 
140
+ if "memo-trap_v2" in dataset_name:
141
  to_add = False
142
 
143
+ if "faithdial" in dataset_name:
144
  to_add = False
145
 
146
+ if "truthfulqa_gen" in dataset_name:
147
  to_add = False
148
 
149
+ if "bertscore" in metric_name:
150
+ if "precision" not in metric_name:
151
  to_add = False
152
 
153
+ if "halueval" in dataset_name:
154
+ if "acc" not in metric_name:
155
  to_add = False
156
 
157
+ if "ifeval" in dataset_name:
158
+ if "prompt_level_strict_acc" not in metric_name:
159
  to_add = False
160
 
161
+ if "squad" in dataset_name:
162
  # to_add = False
163
+ if "best_exact" in metric_name:
164
  to_add = False
165
 
166
+ if "fever" in dataset_name:
167
  to_add = False
168
 
169
+ if ("xsum" in dataset_name or "cnn" in dataset_name) and "v2" not in dataset_name:
170
  to_add = False
171
 
172
  if isinstance(value, str):
 
176
  to_add = False
177
 
178
  if to_add:
179
+ if "rouge" in metric_name:
180
  value /= 100.0
181
 
182
+ if "squad" in dataset_name:
183
  value /= 100.0
184
 
185
  sanitised_metric_name = metric_name
186
  if "," in sanitised_metric_name:
187
+ sanitised_metric_name = sanitised_metric_name.split(",")[0]
188
  sanitised_metric_name = sanitise_metric(sanitised_metric_name)
189
  sanitised_dataset_name = sanitise_dataset(dataset_name)
190
 
191
+ model_dataset_metric_to_result_map[
192
+ (model_name, sanitised_dataset_name, sanitised_metric_name)
193
+ ] = value
194
 
195
  if model_name not in data_map:
196
  data_map[model_name] = {}
197
  data_map[model_name][(sanitised_dataset_name, sanitised_metric_name)] = value
198
 
199
+ print(
200
+ "model_name",
201
+ model_name,
202
+ "dataset_name",
203
+ sanitised_dataset_name,
204
+ "metric_name",
205
+ sanitised_metric_name,
206
+ "value",
207
+ value,
208
+ )
209
 
210
  save_data_map_to_cache(data_map, cache_file)
211
 
 
217
  if len(data_map[model_name]) < nb_max_metrics - 5:
218
  del data_map[model_name]
219
 
220
+ plot_type_lst = ["all", "summ", "qa", "instr", "detect", "rc"]
221
 
222
  for plot_type in plot_type_lst:
223
 
 
227
  if dataset_metric not in data_map_v2:
228
  data_map_v2[dataset_metric] = {}
229
 
230
+ if plot_type in {"all"}:
231
  to_add = True
232
+ if "ROUGE" in dataset_metric[1] and "ROUGE-L" not in dataset_metric[1]:
233
  to_add = False
234
+ if "SQuAD" in dataset_metric[0] and "EM" not in dataset_metric[1]:
235
  to_add = False
236
+ if "SelfCheckGPT" in dataset_metric[0] and "MAX" not in dataset_metric[1]:
237
  to_add = False
238
+ if "64-shot" in dataset_metric[0]:
239
  to_add = False
240
  if to_add is True:
241
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
242
+ elif plot_type in {"summ"}:
243
+ if "CNN" in dataset_metric[0] or "XSum" in dataset_metric[0]:
244
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
245
+ elif plot_type in {"qa"}:
246
+ if "TriviaQA" in dataset_metric[0] or "NQ" in dataset_metric[0] or "TruthfulQA" in dataset_metric[0]:
247
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
248
+ elif plot_type in {"instr"}:
249
+ if "MemoTrap" in dataset_metric[0] or "IFEval" in dataset_metric[0]:
250
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
251
+ elif plot_type in {"detect"}:
252
+ if "HaluEval" in dataset_metric[0] or "SelfCheck" in dataset_metric[0]:
253
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
254
+ elif plot_type in {"rc"}:
255
+ if "RACE" in dataset_metric[0] or "SQuAD" in dataset_metric[0]:
256
  data_map_v2[dataset_metric][model_name] = data_map[model_name][dataset_metric]
257
  else:
258
  assert False, f"Unknown plot type: {plot_type}"
259
 
260
  # df = pd.DataFrame.from_dict(data_map, orient='index') # Invert the y-axis (rows)
261
+ df = pd.DataFrame.from_dict(data_map_v2, orient="index") # Invert the y-axis (rows)
262
+ df.index = [", ".join(map(str, idx)) for idx in df.index]
263
 
264
  o_df = df.copy(deep=True)
265
 
 
278
 
279
  # Calculate dimensions based on the DataFrame size
280
  cell_height = 1.0 # Height of each cell in inches
281
+ cell_width = 1.0 # Width of each cell in inches
282
 
283
  n_rows = len(df.index) # Datasets and Metrics
284
  n_cols = len(df.columns) # Models
 
292
 
293
  sns.set_context("notebook", font_scale=1.3)
294
 
295
+ dendrogram_ratio = (0.1, 0.1)
296
 
297
+ if plot_type in {"detect"}:
298
  fig_width = cell_width * n_cols - 2
299
  fig_height = cell_height * n_rows + 5.2
300
+ dendrogram_ratio = (0.1, 0.2)
301
 
302
+ if plot_type in {"instr"}:
303
  fig_width = cell_width * n_cols - 2
304
  fig_height = cell_height * n_rows + 5.2
305
+ dendrogram_ratio = (0.1, 0.4)
306
 
307
+ if plot_type in {"qa"}:
308
  fig_width = cell_width * n_cols - 2
309
  fig_height = cell_height * n_rows + 4
310
+ dendrogram_ratio = (0.1, 0.2)
311
 
312
+ if plot_type in {"summ"}:
313
  fig_width = cell_width * n_cols - 2
314
  fig_height = cell_height * n_rows + 2.0
315
+ dendrogram_ratio = (0.1, 0.1)
316
  row_cluster = False
317
 
318
+ if plot_type in {"rc"}:
319
  fig_width = cell_width * n_cols - 2
320
  fig_height = cell_height * n_rows + 5.2
321
+ dendrogram_ratio = (0.1, 0.4)
322
 
323
+ print("figsize", (fig_width, fig_height))
324
 
325
+ o_df.to_json(f"plots/clustermap_{plot_type}.json", orient="split")
326
 
327
+ print(f"Generating the clustermaps for {plot_type}")
328
 
329
+ for cmap in [None, "coolwarm", "viridis"]:
330
+ fig = sns.clustermap(
331
+ df,
332
+ method="ward",
333
+ metric="euclidean",
334
+ cmap=cmap,
335
+ figsize=(fig_width, fig_height), # figsize=(24, 16),
336
+ annot=True,
337
+ mask=o_df.isnull(),
338
+ dendrogram_ratio=dendrogram_ratio,
339
+ fmt=".2f",
340
+ col_cluster=col_cluster,
341
+ row_cluster=row_cluster,
342
+ )
343
 
344
  # Adjust the size of the cells (less wide)
345
  plt.setp(fig.ax_heatmap.get_yticklabels(), rotation=0)
346
  plt.setp(fig.ax_heatmap.get_xticklabels(), rotation=90)
347
 
348
+ cmap_suffix = "" if cmap is None else f"_{cmap}"
349
 
350
  # Save the clustermap to file
351
+ fig.savefig(f"blog/figures/clustermap_{plot_type}{cmap_suffix}.pdf")
352
+ fig.savefig(f"blog/figures/clustermap_{plot_type}{cmap_suffix}.png")
353
+ fig.savefig(f"blog/figures/clustermap_{plot_type}{cmap_suffix}_t.png", transparent=True, facecolor="none")
cli/averitec-upload-cli.py CHANGED
@@ -2,11 +2,13 @@
2
 
3
  from datasets import load_dataset
4
 
5
- path = 'pminervini/averitec'
6
 
7
- ds = load_dataset("json",
8
- data_files={
9
- 'train': '/Users/pasquale/workspace/AVeriTeC/data/train.json',
10
- 'dev': '/Users/pasquale/workspace/AVeriTeC/data/dev.json'
11
- })
 
 
12
  ds.push_to_hub(path)
 
2
 
3
  from datasets import load_dataset
4
 
5
+ path = "pminervini/averitec"
6
 
7
+ ds = load_dataset(
8
+ "json",
9
+ data_files={
10
+ "train": "/Users/pasquale/workspace/AVeriTeC/data/train.json",
11
+ "dev": "/Users/pasquale/workspace/AVeriTeC/data/dev.json",
12
+ },
13
+ )
14
  ds.push_to_hub(path)
cli/beta-cli.py CHANGED
@@ -14,8 +14,12 @@ from src.leaderboard.read_evals import get_raw_eval_results
14
  from src.backend.manage_requests import EvalRequest
15
  from src.leaderboard.read_evals import EvalResult
16
 
17
- snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
18
- snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
 
 
 
 
19
 
20
  PENDING_STATUS = "PENDING"
21
  RUNNING_STATUS = "RUNNING"
@@ -40,7 +44,9 @@ def request_to_result_name(request: EvalRequest) -> str:
40
 
41
 
42
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
43
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
44
  # Sort the evals by priority (first submitted first run)
45
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
46
 
@@ -49,8 +55,8 @@ eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_RE
49
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
50
  result_name_to_result = {r.eval_name: r for r in eval_results}
51
 
52
- print('Requests', sorted(result_name_to_request.keys()))
53
- print('Results', sorted(result_name_to_result.keys()))
54
 
55
  for eval_request in eval_requests:
56
  result_name: str = request_to_result_name(eval_request)
@@ -63,7 +69,7 @@ for eval_request in eval_requests:
63
  task_name = task.benchmark
64
 
65
  if task_name not in eval_result.results:
66
- print('RUN THIS ONE!', result_name, task_name)
67
 
68
  raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
69
  all_data_json = [v.to_dict() for v in raw_data if v.is_complete()]
 
14
  from src.backend.manage_requests import EvalRequest
15
  from src.leaderboard.read_evals import EvalResult
16
 
17
+ snapshot_download(
18
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
19
+ )
20
+ snapshot_download(
21
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
22
+ )
23
 
24
  PENDING_STATUS = "PENDING"
25
  RUNNING_STATUS = "RUNNING"
 
44
 
45
 
46
  # Get all eval requests that are FINISHED; to run other evals, change this parameter
47
+ eval_requests: list[EvalRequest] = get_eval_requests(
48
+ job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
49
+ )
50
  # Sort the evals by priority (first submitted first run)
51
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
52
 
 
55
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
56
  result_name_to_result = {r.eval_name: r for r in eval_results}
57
 
58
+ print("Requests", sorted(result_name_to_request.keys()))
59
+ print("Results", sorted(result_name_to_result.keys()))
60
 
61
  for eval_request in eval_requests:
62
  result_name: str = request_to_result_name(eval_request)
 
69
  task_name = task.benchmark
70
 
71
  if task_name not in eval_result.results:
72
+ print("RUN THIS ONE!", result_name, task_name)
73
 
74
  raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
75
  all_data_json = [v.to_dict() for v in raw_data if v.is_complete()]
cli/completed-cli.py CHANGED
@@ -26,8 +26,12 @@ FAILED_STATUS = "FAILED"
26
 
27
  TASKS_HARNESS = [task.value for task in Tasks]
28
 
29
- snapshot_download(repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
30
- snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
 
 
 
31
 
32
 
33
  def request_to_result_name(request: EvalRequest) -> str:
@@ -48,9 +52,10 @@ def process_finished_requests() -> bool:
48
  if False:
49
  import os
50
  import dateutil
 
51
  model_result_filepaths = []
52
- results_path = f'{EVAL_RESULTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B'
53
- requests_path = f'{EVAL_REQUESTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B_eval_request_False_False_False.json'
54
 
55
  for root, _, files in os.walk(results_path):
56
  # We should only have json files in model results
@@ -72,7 +77,7 @@ def process_finished_requests() -> bool:
72
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
73
  eval_result.update_with_request_file(requests_path)
74
 
75
- print('XXX', eval_result)
76
 
77
  # Store results of same eval together
78
  eval_name = eval_result.eval_name
@@ -86,7 +91,9 @@ def process_finished_requests() -> bool:
86
  return True
87
 
88
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
89
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
90
  # Sort the evals by priority (first submitted first run)
91
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
92
 
@@ -94,9 +101,11 @@ def process_finished_requests() -> bool:
94
  # eval_requests = [r for r in eval_requests if 'neo-1.3B' in r.model]
95
 
96
  import random
 
97
  random.shuffle(eval_requests)
98
 
99
  from src.leaderboard.read_evals import get_raw_eval_results
 
100
  eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
101
 
102
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
@@ -107,7 +116,10 @@ def process_finished_requests() -> bool:
107
 
108
  # Check the corresponding result
109
  from typing import Optional
110
- eval_result: Optional[EvalResult] = result_name_to_result[result_name] if result_name in result_name_to_result else None
 
 
 
111
 
112
  # Iterate over tasks and, if we do not have results for a task, run the relevant evaluations
113
  for task in TASKS_HARNESS:
@@ -117,7 +129,7 @@ def process_finished_requests() -> bool:
117
  eval_request: EvalRequest = result_name_to_request[result_name]
118
 
119
  # print(eval_result)
120
- print(result_name, 'is incomplete -- missing task:', task_name, eval_result, eval_request.likes)
121
 
122
 
123
  if __name__ == "__main__":
 
26
 
27
  TASKS_HARNESS = [task.value for task in Tasks]
28
 
29
+ snapshot_download(
30
+ repo_id=RESULTS_REPO, revision="main", local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset", max_workers=60
31
+ )
32
+ snapshot_download(
33
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
34
+ )
35
 
36
 
37
  def request_to_result_name(request: EvalRequest) -> str:
 
52
  if False:
53
  import os
54
  import dateutil
55
+
56
  model_result_filepaths = []
57
+ results_path = f"{EVAL_RESULTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B"
58
+ requests_path = f"{EVAL_REQUESTS_PATH_BACKEND}/EleutherAI/gpt-neo-1.3B_eval_request_False_False_False.json"
59
 
60
  for root, _, files in os.walk(results_path):
61
  # We should only have json files in model results
 
77
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
78
  eval_result.update_with_request_file(requests_path)
79
 
80
+ print("XXX", eval_result)
81
 
82
  # Store results of same eval together
83
  eval_name = eval_result.eval_name
 
91
  return True
92
 
93
  # Get all eval requests that are FINISHED; to run other evals, change this parameter
94
+ eval_requests: list[EvalRequest] = get_eval_requests(
95
+ job_status=current_finished_status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
96
+ )
97
  # Sort the evals by priority (first submitted first run)
98
  eval_requests: list[EvalRequest] = sort_models_by_priority(api=API, models=eval_requests)
99
 
 
101
  # eval_requests = [r for r in eval_requests if 'neo-1.3B' in r.model]
102
 
103
  import random
104
+
105
  random.shuffle(eval_requests)
106
 
107
  from src.leaderboard.read_evals import get_raw_eval_results
108
+
109
  eval_results: list[EvalResult] = get_raw_eval_results(EVAL_RESULTS_PATH_BACKEND, EVAL_REQUESTS_PATH_BACKEND)
110
 
111
  result_name_to_request = {request_to_result_name(r): r for r in eval_requests}
 
116
 
117
  # Check the corresponding result
118
  from typing import Optional
119
+
120
+ eval_result: Optional[EvalResult] = (
121
+ result_name_to_result[result_name] if result_name in result_name_to_result else None
122
+ )
123
 
124
  # Iterate over tasks and, if we do not have results for a task, run the relevant evaluations
125
  for task in TASKS_HARNESS:
 
129
  eval_request: EvalRequest = result_name_to_request[result_name]
130
 
131
  # print(eval_result)
132
+ print(result_name, "is incomplete -- missing task:", task_name, eval_result, eval_request.likes)
133
 
134
 
135
  if __name__ == "__main__":
cli/eval-cli.py CHANGED
@@ -35,12 +35,11 @@ def main():
35
  status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
36
 
37
  # Get all eval request that are FINISHED, if you want to run other evals, change this parameter
38
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=status,
39
- hf_repo=QUEUE_REPO,
40
- local_dir=EVAL_REQUESTS_PATH_BACKEND,
41
- do_download=False)
42
  # eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
43
- eval_request = [r for r in eval_requests if 'meta-llama/Llama-2-7b-hf' in r.model][0]
44
 
45
  # my_task = Task("memo-trap", "acc", "memo-trap", 0)
46
  # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
@@ -56,6 +55,7 @@ def main():
56
 
57
  eval_logger = utils.eval_logger
58
  import logging
 
59
  eval_logger.setLevel(getattr(logging, "DEBUG"))
60
 
61
  TASKS_HARNESS = [my_task]
@@ -75,9 +75,19 @@ def main():
75
  import torch
76
 
77
  # breakpoint()
78
- results = evaluator.simple_evaluate(model="hf", model_args=eval_request.get_model_args(), tasks=[task.benchmark], num_fewshot=task.num_fewshot,
79
- batch_size=1, device="mps", use_cache=None, limit=2, write_out=True, task_manager=task_manager)
80
- print('AAA', results["results"])
 
 
 
 
 
 
 
 
 
 
81
 
82
  breakpoint()
83
 
 
35
  status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
36
 
37
  # Get the eval requests whose status is in the `status` list above; edit that list to select other evals
38
+ eval_requests: list[EvalRequest] = get_eval_requests(
39
+ job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND, do_download=False
40
+ )
 
41
  # eval_request = [r for r in eval_requests if 'bloom-560m' in r.model][0]
42
+ eval_request = [r for r in eval_requests if "meta-llama/Llama-2-7b-hf" in r.model][0]
43
 
44
  # my_task = Task("memo-trap", "acc", "memo-trap", 0)
45
  # my_task = Task("selfcheckgpt", "avg-selfcheckgpt", "SGPT", 2)
 
55
 
56
  eval_logger = utils.eval_logger
57
  import logging
58
+
59
  eval_logger.setLevel(getattr(logging, "DEBUG"))
60
 
61
  TASKS_HARNESS = [my_task]
 
75
  import torch
76
 
77
  # breakpoint()
78
+ results = evaluator.simple_evaluate(
79
+ model="hf",
80
+ model_args=eval_request.get_model_args(),
81
+ tasks=[task.benchmark],
82
+ num_fewshot=task.num_fewshot,
83
+ batch_size=1,
84
+ device="mps",
85
+ use_cache=None,
86
+ limit=2,
87
+ write_out=True,
88
+ task_manager=task_manager,
89
+ )
90
+ print("AAA", results["results"])
91
 
92
  breakpoint()
93
 
cli/fever-upload-cli.py CHANGED
@@ -18,12 +18,9 @@ def convert(list_of_dicts):
18
 
19
 
20
  v10 = load_dataset("fever", "v1.0")
21
- name_lst = ['train', 'labelled_dev']
22
 
23
- old_to_new_label_map = {
24
- 'SUPPORTS': 'supported',
25
- 'REFUTES': 'refuted'
26
- }
27
 
28
  data_map = {}
29
 
@@ -31,28 +28,28 @@ for name in name_lst:
31
  instance_lst = []
32
 
33
  for entry in tqdm(v10[name]):
34
- id_ = entry['id']
35
- label = entry['label']
36
- claim = entry['claim']
37
 
38
- evidence_id = entry['evidence_id']
39
- evidence_wiki_url = entry['evidence_wiki_url']
40
 
41
  if evidence_id != -1:
42
- assert label in {'SUPPORTS', 'REFUTES'}
43
 
44
- instance = {'id': id_, 'label': old_to_new_label_map[label], 'claim': claim}
45
  instance_lst.append(instance)
46
 
47
- key = 'dev' if name in {'labelled_dev'} else name
48
 
49
- instance_lst = sorted([dict(t) for t in {tuple(d.items()) for d in instance_lst}], key=lambda d: d['claim'])
50
 
51
  label_to_instance_lst = {}
52
  for e in instance_lst:
53
- if e['label'] not in label_to_instance_lst:
54
- label_to_instance_lst[e['label']] = []
55
- label_to_instance_lst[e['label']].append(e)
56
 
57
  min_len = min(len(v) for k, v in label_to_instance_lst.items())
58
 
@@ -63,7 +60,7 @@ for name in name_lst:
63
  random.Random(42).shuffle(new_instance_lst)
64
  data_map[key] = new_instance_lst
65
 
66
- ds_path = 'pminervini/hl-fever'
67
 
68
  task_to_ds_map = {k: Dataset.from_dict(convert(v)) for k, v in data_map.items()}
69
  ds_dict = DatasetDict(task_to_ds_map)
 
18
 
19
 
20
  v10 = load_dataset("fever", "v1.0")
21
+ name_lst = ["train", "labelled_dev"]
22
 
23
+ old_to_new_label_map = {"SUPPORTS": "supported", "REFUTES": "refuted"}
 
 
 
24
 
25
  data_map = {}
26
 
 
28
  instance_lst = []
29
 
30
  for entry in tqdm(v10[name]):
31
+ id_ = entry["id"]
32
+ label = entry["label"]
33
+ claim = entry["claim"]
34
 
35
+ evidence_id = entry["evidence_id"]
36
+ evidence_wiki_url = entry["evidence_wiki_url"]
37
 
38
  if evidence_id != -1:
39
+ assert label in {"SUPPORTS", "REFUTES"}
40
 
41
+ instance = {"id": id_, "label": old_to_new_label_map[label], "claim": claim}
42
  instance_lst.append(instance)
43
 
44
+ key = "dev" if name in {"labelled_dev"} else name
45
 
46
+ instance_lst = sorted([dict(t) for t in {tuple(d.items()) for d in instance_lst}], key=lambda d: d["claim"])
47
 
48
  label_to_instance_lst = {}
49
  for e in instance_lst:
50
+ if e["label"] not in label_to_instance_lst:
51
+ label_to_instance_lst[e["label"]] = []
52
+ label_to_instance_lst[e["label"]].append(e)
53
 
54
  min_len = min(len(v) for k, v in label_to_instance_lst.items())
55
 
 
60
  random.Random(42).shuffle(new_instance_lst)
61
  data_map[key] = new_instance_lst
62
 
63
+ ds_path = "pminervini/hl-fever"
64
 
65
  task_to_ds_map = {k: Dataset.from_dict(convert(v)) for k, v in data_map.items()}
66
  ds_dict = DatasetDict(task_to_ds_map)
cli/fix-requests-cli.py CHANGED
@@ -10,12 +10,12 @@ from huggingface_hub import HfApi
10
  def find_json_files(directory):
11
  matches = []
12
  for root, dirnames, filenames in os.walk(directory):
13
- for filename in fnmatch.filter(filenames, '*.json'):
14
  matches.append(os.path.join(root, filename))
15
  return matches
16
 
17
 
18
- directory_path = '/Users/pasquale/workspace/eval/requests'
19
  json_files = find_json_files(directory_path)
20
 
21
  api = HfApi()
@@ -26,29 +26,29 @@ model_lst = [m for m in model_lst]
26
  id_to_model = {m.id: m for m in model_lst}
27
 
28
  for path in json_files:
29
- with open(path, 'r') as fr:
30
  data = json.load(fr)
31
 
32
- model_id = data['model']
33
  if model_id in id_to_model:
34
  model = id_to_model[model_id]
35
 
36
  to_overwrite = False
37
 
38
- is_finetuned = any(tag.startswith('base_model:') for tag in id_to_model[data['model']].tags)
39
 
40
  if is_finetuned:
41
  data["model_type"] = "fine-tuned"
42
  to_overwrite = True
43
 
44
- is_instruction_tuned = ('nstruct' in model_id) or ('chat' in model_id)
45
  if is_instruction_tuned:
46
  data["model_type"] = "instruction-tuned"
47
  to_overwrite = True
48
 
49
  if to_overwrite is True:
50
- with open(path, 'w') as fw:
51
  json.dump(data, fw)
52
 
53
  else:
54
- print(f'Model {model_id} not found')
 
10
  def find_json_files(directory):
11
  matches = []
12
  for root, dirnames, filenames in os.walk(directory):
13
+ for filename in fnmatch.filter(filenames, "*.json"):
14
  matches.append(os.path.join(root, filename))
15
  return matches
16
 
17
 
18
+ directory_path = "/Users/pasquale/workspace/eval/requests"
19
  json_files = find_json_files(directory_path)
20
 
21
  api = HfApi()
 
26
  id_to_model = {m.id: m for m in model_lst}
27
 
28
  for path in json_files:
29
+ with open(path, "r") as fr:
30
  data = json.load(fr)
31
 
32
+ model_id = data["model"]
33
  if model_id in id_to_model:
34
  model = id_to_model[model_id]
35
 
36
  to_overwrite = False
37
 
38
+ is_finetuned = any(tag.startswith("base_model:") for tag in id_to_model[data["model"]].tags)
39
 
40
  if is_finetuned:
41
  data["model_type"] = "fine-tuned"
42
  to_overwrite = True
43
 
44
+ is_instruction_tuned = ("nstruct" in model_id) or ("chat" in model_id)
45
  if is_instruction_tuned:
46
  data["model_type"] = "instruction-tuned"
47
  to_overwrite = True
48
 
49
  if to_overwrite is True:
50
+ with open(path, "w") as fw:
51
  json.dump(data, fw)
52
 
53
  else:
54
+ print(f"Model {model_id} not found")
cli/halueval-upload-cli.py CHANGED
@@ -6,20 +6,20 @@ import requests
6
  from datasets import load_dataset, Dataset, DatasetDict
7
 
8
 
9
- path = 'pminervini/HaluEval'
10
 
11
  API_URL = f"https://datasets-server.huggingface.co/splits?dataset={path}"
12
  response = requests.get(API_URL)
13
  res_json = response.json()
14
 
15
- gold_splits = {'dialogue', 'qa', 'summarization', 'general'}
16
 
17
- available_splits = {split['config'] for split in res_json['splits']} if 'splits' in res_json else set()
18
 
19
  name_to_ds = dict()
20
 
21
  for name in gold_splits:
22
- ds = load_dataset("json", data_files={'data': f"data/{name}_data.json"})
23
  name_to_ds[name] = ds
24
  # if name not in available_splits:
25
  ds.push_to_hub(path, config_name=name)
@@ -35,38 +35,38 @@ def list_to_dict(lst: list) -> dict:
35
  return res
36
 
37
 
38
- for name in (gold_splits - {'general'}):
39
  random.seed(42)
40
  ds = name_to_ds[name]
41
  new_entry_lst = []
42
-
43
- for entry in ds['data']:
44
  is_hallucinated = random.random() > 0.5
45
  new_entry = None
46
- if name in {'qa'}:
47
  new_entry = {
48
- 'knowledge': entry['knowledge'],
49
- 'question': entry['question'],
50
- 'answer': entry[f'{"hallucinated" if is_hallucinated else "right"}_answer'],
51
- 'hallucination': 'yes' if is_hallucinated else 'no'
52
  }
53
- if name in {'dialogue'}:
54
  new_entry = {
55
- 'knowledge': entry['knowledge'],
56
- 'dialogue_history': entry['dialogue_history'],
57
- 'response': entry[f'{"hallucinated" if is_hallucinated else "right"}_response'],
58
- 'hallucination': 'yes' if is_hallucinated else 'no'
59
  }
60
- if name in {'summarization'}:
61
  new_entry = {
62
- 'document': entry['document'],
63
- 'summary': entry[f'{"hallucinated" if is_hallucinated else "right"}_summary'],
64
- 'hallucination': 'yes' if is_hallucinated else 'no'
65
  }
66
  assert new_entry is not None
67
  new_entry_lst += [new_entry]
68
  new_ds_map = list_to_dict(new_entry_lst)
69
  new_ds = Dataset.from_dict(new_ds_map)
70
- new_dsd = DatasetDict({'data': new_ds})
71
 
72
- new_dsd.push_to_hub(path, config_name=f'{name}_samples')
 
6
  from datasets import load_dataset, Dataset, DatasetDict
7
 
8
 
9
+ path = "pminervini/HaluEval"
10
 
11
  API_URL = f"https://datasets-server.huggingface.co/splits?dataset={path}"
12
  response = requests.get(API_URL)
13
  res_json = response.json()
14
 
15
+ gold_splits = {"dialogue", "qa", "summarization", "general"}
16
 
17
+ available_splits = {split["config"] for split in res_json["splits"]} if "splits" in res_json else set()
18
 
19
  name_to_ds = dict()
20
 
21
  for name in gold_splits:
22
+ ds = load_dataset("json", data_files={"data": f"data/{name}_data.json"})
23
  name_to_ds[name] = ds
24
  # if name not in available_splits:
25
  ds.push_to_hub(path, config_name=name)
 
35
  return res
36
 
37
 
38
+ for name in gold_splits - {"general"}:
39
  random.seed(42)
40
  ds = name_to_ds[name]
41
  new_entry_lst = []
42
+
43
+ for entry in ds["data"]:
44
  is_hallucinated = random.random() > 0.5
45
  new_entry = None
46
+ if name in {"qa"}:
47
  new_entry = {
48
+ "knowledge": entry["knowledge"],
49
+ "question": entry["question"],
50
+ "answer": entry[f'{"hallucinated" if is_hallucinated else "right"}_answer'],
51
+ "hallucination": "yes" if is_hallucinated else "no",
52
  }
53
+ if name in {"dialogue"}:
54
  new_entry = {
55
+ "knowledge": entry["knowledge"],
56
+ "dialogue_history": entry["dialogue_history"],
57
+ "response": entry[f'{"hallucinated" if is_hallucinated else "right"}_response'],
58
+ "hallucination": "yes" if is_hallucinated else "no",
59
  }
60
+ if name in {"summarization"}:
61
  new_entry = {
62
+ "document": entry["document"],
63
+ "summary": entry[f'{"hallucinated" if is_hallucinated else "right"}_summary'],
64
+ "hallucination": "yes" if is_hallucinated else "no",
65
  }
66
  assert new_entry is not None
67
  new_entry_lst += [new_entry]
68
  new_ds_map = list_to_dict(new_entry_lst)
69
  new_ds = Dataset.from_dict(new_ds_map)
70
+ new_dsd = DatasetDict({"data": new_ds})
71
 
72
+ new_dsd.push_to_hub(path, config_name=f"{name}_samples")
cli/isp-upload-cli.py CHANGED
@@ -5,16 +5,16 @@ import os
5
 
6
  from datasets import load_dataset
7
 
8
- folder_path = 'isp-data-json/' # Replace with your folder path
9
 
10
  # Search for all .json files in the folder
11
- json_files = glob.glob(os.path.join(folder_path, '*.jsonl'))
12
 
13
- path = 'pminervini/inverse-scaling'
14
 
15
  for json_path in json_files:
16
  base_name = os.path.basename(json_path)
17
  name = base_name.split("_")[0]
18
 
19
- ds = load_dataset("json", data_files={'data': json_path})
20
  ds.push_to_hub(path, config_name=name)
 
5
 
6
  from datasets import load_dataset
7
 
8
+ folder_path = "isp-data-json/" # Replace with your folder path
9
 
10
  # Search for all .json files in the folder
11
+ json_files = glob.glob(os.path.join(folder_path, "*.jsonl"))
12
 
13
+ path = "pminervini/inverse-scaling"
14
 
15
  for json_path in json_files:
16
  base_name = os.path.basename(json_path)
17
  name = base_name.split("_")[0]
18
 
19
+ ds = load_dataset("json", data_files={"data": json_path})
20
  ds.push_to_hub(path, config_name=name)
cli/nqswap-upload-cli.py CHANGED
@@ -2,11 +2,7 @@
2
 
3
  from datasets import load_dataset
4
 
5
- path = 'pminervini/NQ-Swap'
6
 
7
- ds = load_dataset("json",
8
- data_files={
9
- 'original': 'nqswap/original.jsonl',
10
- 'substituted': 'nqswap/substituted.jsonl'
11
- })
12
  ds.push_to_hub(path)
 
2
 
3
  from datasets import load_dataset
4
 
5
+ path = "pminervini/NQ-Swap"
6
 
7
+ ds = load_dataset("json", data_files={"original": "nqswap/original.jsonl", "substituted": "nqswap/substituted.jsonl"})
 
 
 
 
8
  ds.push_to_hub(path)
cli/shroom-upload-cli.py CHANGED
@@ -4,9 +4,9 @@ import json
4
  from datasets import Dataset, DatasetDict
5
 
6
  file_path = "shroom-data/val.model-agnostic.json"
7
- ds_path = 'pminervini/shroom'
8
 
9
- with open(file_path, 'r') as file:
10
  data = json.load(file)
11
 
12
 
@@ -15,7 +15,7 @@ def convert(list_of_dicts):
15
  for d in list_of_dicts:
16
  for key, value in d.items():
17
  dict_of_lists.setdefault(key, []).append(value)
18
- return dict_of_lists
19
 
20
 
21
  task_to_data_map = {}
 
4
  from datasets import Dataset, DatasetDict
5
 
6
  file_path = "shroom-data/val.model-agnostic.json"
7
+ ds_path = "pminervini/shroom"
8
 
9
+ with open(file_path, "r") as file:
10
  data = json.load(file)
11
 
12
 
 
15
  for d in list_of_dicts:
16
  for key, value in d.items():
17
  dict_of_lists.setdefault(key, []).append(value)
18
+ return dict_of_lists
19
 
20
 
21
  task_to_data_map = {}
cli/submit-cli.py CHANGED
@@ -15,7 +15,9 @@ from src.backend.manage_requests import get_eval_requests
15
  from src.backend.manage_requests import EvalRequest
16
 
17
 
18
- def add_new_eval(model: str, base_model: str, revision: str, precision: str, private: bool, weight_type: str, model_type: str):
 
 
19
  REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
20
 
21
  user_name = ""
@@ -37,7 +39,9 @@ def add_new_eval(model: str, base_model: str, revision: str, precision: str, pri
37
 
38
  # Is the model on the hub?
39
  if weight_type in ["Delta", "Adapter"]:
40
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
 
 
41
  if not base_model_on_hub:
42
  print(f'Base model "{base_model}" {error}')
43
  return
@@ -57,7 +61,7 @@ def add_new_eval(model: str, base_model: str, revision: str, precision: str, pri
57
 
58
  model_size = get_model_size(model_info=model_info, precision=precision)
59
 
60
- license = 'none'
61
  try:
62
  license = model_info.cardData["license"]
63
  except Exception:
@@ -101,13 +105,20 @@ def add_new_eval(model: str, base_model: str, revision: str, precision: str, pri
101
  f.write(json.dumps(eval_entry))
102
 
103
  print("Uploading eval file")
104
- API.upload_file(path_or_fileobj=out_path, path_in_repo=out_path.split("eval-queue/")[1],
105
- repo_id=QUEUE_REPO, repo_type="dataset", commit_message=f"Add {model} to eval queue")
 
 
 
 
 
106
 
107
  # Remove the local file
108
  os.remove(out_path)
109
 
110
- print("Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list.")
 
 
111
  return
112
 
113
 
@@ -122,12 +133,14 @@ def main():
122
  def custom_filter(m) -> bool:
123
  # res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False
124
  # res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False and 'mistralai/' in m.id
125
- res = 'mistralai/' in m.id
126
  return res
127
 
128
  filtered_model_lst = sorted([m for m in model_lst if custom_filter(m)], key=lambda m: m.downloads, reverse=True)
129
 
130
- snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60)
 
 
131
 
132
  PENDING_STATUS = "PENDING"
133
  RUNNING_STATUS = "RUNNING"
@@ -137,7 +150,9 @@ def main():
137
  status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
138
 
139
  # Get all eval requests
140
- eval_requests: list[EvalRequest] = get_eval_requests(job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND)
 
 
141
 
142
  requested_model_names = {e.model for e in eval_requests}
143
 
@@ -146,25 +161,33 @@ def main():
146
  for i in range(min(200, len(filtered_model_lst))):
147
  model = filtered_model_lst[i]
148
 
149
- print(f'Considering {model.id} ..')
150
 
151
- is_finetuned = any(tag.startswith('base_model:') for tag in model.tags)
152
 
153
- model_type = 'pretrained'
154
  if is_finetuned:
155
  model_type = "fine-tuned"
156
 
157
- is_instruction_tuned = 'nstruct' in model.id
158
  if is_instruction_tuned:
159
  model_type = "instruction-tuned"
160
 
161
  if model.id not in requested_model_names:
162
 
163
- if 'mage' not in model.id:
164
- add_new_eval(model=model.id, base_model='', revision='main', precision='float32', private=False, weight_type='Original', model_type=model_type)
 
 
 
 
 
 
 
 
165
  time.sleep(10)
166
  else:
167
- print(f'Model {model.id} already added, not adding it to the queue again.')
168
 
169
 
170
  if __name__ == "__main__":
 
15
  from src.backend.manage_requests import EvalRequest
16
 
17
 
18
+ def add_new_eval(
19
+ model: str, base_model: str, revision: str, precision: str, private: bool, weight_type: str, model_type: str
20
+ ):
21
  REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
22
 
23
  user_name = ""
 
39
 
40
  # Is the model on the hub?
41
  if weight_type in ["Delta", "Adapter"]:
42
+ base_model_on_hub, error, _ = is_model_on_hub(
43
+ model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True
44
+ )
45
  if not base_model_on_hub:
46
  print(f'Base model "{base_model}" {error}')
47
  return
 
61
 
62
  model_size = get_model_size(model_info=model_info, precision=precision)
63
 
64
+ license = "none"
65
  try:
66
  license = model_info.cardData["license"]
67
  except Exception:
 
105
  f.write(json.dumps(eval_entry))
106
 
107
  print("Uploading eval file")
108
+ API.upload_file(
109
+ path_or_fileobj=out_path,
110
+ path_in_repo=out_path.split("eval-queue/")[1],
111
+ repo_id=QUEUE_REPO,
112
+ repo_type="dataset",
113
+ commit_message=f"Add {model} to eval queue",
114
+ )
115
 
116
  # Remove the local file
117
  os.remove(out_path)
118
 
119
+ print(
120
+ "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
121
+ )
122
  return
123
 
124
 
 
133
  def custom_filter(m) -> bool:
134
  # res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False
135
  # res = m.pipeline_tag in {'text-generation'} and 'en' in m.tags and m.private is False and 'mistralai/' in m.id
136
+ res = "mistralai/" in m.id
137
  return res
138
 
139
  filtered_model_lst = sorted([m for m in model_lst if custom_filter(m)], key=lambda m: m.downloads, reverse=True)
140
 
141
+ snapshot_download(
142
+ repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
143
+ )
144
 
145
  PENDING_STATUS = "PENDING"
146
  RUNNING_STATUS = "RUNNING"
 
150
  status = [PENDING_STATUS, RUNNING_STATUS, FINISHED_STATUS, FAILED_STATUS]
151
 
152
  # Get all eval requests
153
+ eval_requests: list[EvalRequest] = get_eval_requests(
154
+ job_status=status, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND
155
+ )
156
 
157
  requested_model_names = {e.model for e in eval_requests}
158
 
 
161
  for i in range(min(200, len(filtered_model_lst))):
162
  model = filtered_model_lst[i]
163
 
164
+ print(f"Considering {model.id} ..")
165
 
166
+ is_finetuned = any(tag.startswith("base_model:") for tag in model.tags)
167
 
168
+ model_type = "pretrained"
169
  if is_finetuned:
170
  model_type = "fine-tuned"
171
 
172
+ is_instruction_tuned = "nstruct" in model.id
173
  if is_instruction_tuned:
174
  model_type = "instruction-tuned"
175
 
176
  if model.id not in requested_model_names:
177
 
178
+ if "mage" not in model.id:
179
+ add_new_eval(
180
+ model=model.id,
181
+ base_model="",
182
+ revision="main",
183
+ precision="float32",
184
+ private=False,
185
+ weight_type="Original",
186
+ model_type=model_type,
187
+ )
188
  time.sleep(10)
189
  else:
190
+ print(f"Model {model.id} already added, not adding it to the queue again.")
191
 
192
 
193
  if __name__ == "__main__":
cli/sync-open-llm-cli.py CHANGED
@@ -10,6 +10,7 @@ from src.envs import QUEUE_REPO, API
10
  from src.envs import EVAL_REQUESTS_PATH_OPEN_LLM, QUEUE_REPO_OPEN_LLM
11
  from src.utils import my_snapshot_download
12
 
 
13
  def my_set_eval_request(api, json_filepath, hf_repo, local_dir):
14
  for i in range(10):
15
  try:
@@ -29,8 +30,12 @@ def set_eval_request(api: HfApi, json_filepath: str, hf_repo: str, local_dir: st
29
  with open(json_filepath, "w") as f:
30
  f.write(json.dumps(data))
31
 
32
- api.upload_file(path_or_fileobj=json_filepath, path_in_repo=json_filepath.replace(local_dir, ""),
33
- repo_id=hf_repo, repo_type="dataset")
 
 
 
 
34
 
35
 
36
  def get_request_file_for_model(data, requests_path):
@@ -54,6 +59,7 @@ def get_request_file_for_model(data, requests_path):
54
  request_file = tmp_request_file
55
  return request_file
56
 
 
57
  def update_model_type(data, requests_path):
58
  open_llm_request_file = get_request_file_for_model(data, requests_path)
59
 
@@ -71,21 +77,33 @@ def read_and_write_json_files(directory, requests_path_open_llm):
71
  for subdir, dirs, files in tqdm(os.walk(directory), desc="updating model type according to open llm leaderboard"):
72
  for file in files:
73
  # Check if the file is a JSON file
74
- if file.endswith('.json'):
75
  file_path = os.path.join(subdir, file)
76
  # Open and read the JSON file
77
- with open(file_path, 'r') as json_file:
78
  data = json.load(json_file)
79
  sucess, data = update_model_type(data, requests_path_open_llm)
80
  if sucess:
81
- with open(file_path, 'w') as json_file:
82
  json.dump(data, json_file)
83
- my_set_eval_request(api=API, json_filepath=file_path, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND_SYNC)
84
-
85
-
86
 
87
 
88
  if __name__ == "__main__":
89
- my_snapshot_download(repo_id=QUEUE_REPO_OPEN_LLM, revision="main", local_dir=EVAL_REQUESTS_PATH_OPEN_LLM, repo_type="dataset", max_workers=60)
90
- my_snapshot_download(repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND_SYNC, repo_type="dataset", max_workers=60)
91
- read_and_write_json_files(EVAL_REQUESTS_PATH_BACKEND_SYNC, EVAL_REQUESTS_PATH_OPEN_LLM)
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  from src.envs import EVAL_REQUESTS_PATH_OPEN_LLM, QUEUE_REPO_OPEN_LLM
11
  from src.utils import my_snapshot_download
12
 
13
+
14
  def my_set_eval_request(api, json_filepath, hf_repo, local_dir):
15
  for i in range(10):
16
  try:
 
30
  with open(json_filepath, "w") as f:
31
  f.write(json.dumps(data))
32
 
33
+ api.upload_file(
34
+ path_or_fileobj=json_filepath,
35
+ path_in_repo=json_filepath.replace(local_dir, ""),
36
+ repo_id=hf_repo,
37
+ repo_type="dataset",
38
+ )
39
 
40
 
41
  def get_request_file_for_model(data, requests_path):
 
59
  request_file = tmp_request_file
60
  return request_file
61
 
62
+
63
  def update_model_type(data, requests_path):
64
  open_llm_request_file = get_request_file_for_model(data, requests_path)
65
 
 
77
  for subdir, dirs, files in tqdm(os.walk(directory), desc="updating model type according to open llm leaderboard"):
78
  for file in files:
79
  # Check if the file is a JSON file
80
+ if file.endswith(".json"):
81
  file_path = os.path.join(subdir, file)
82
  # Open and read the JSON file
83
+ with open(file_path, "r") as json_file:
84
  data = json.load(json_file)
85
  sucess, data = update_model_type(data, requests_path_open_llm)
86
  if sucess:
87
+ with open(file_path, "w") as json_file:
88
  json.dump(data, json_file)
89
+ my_set_eval_request(
90
+ api=API, json_filepath=file_path, hf_repo=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH_BACKEND_SYNC
91
+ )
92
 
93
 
94
  if __name__ == "__main__":
95
+ my_snapshot_download(
96
+ repo_id=QUEUE_REPO_OPEN_LLM,
97
+ revision="main",
98
+ local_dir=EVAL_REQUESTS_PATH_OPEN_LLM,
99
+ repo_type="dataset",
100
+ max_workers=60,
101
+ )
102
+ my_snapshot_download(
103
+ repo_id=QUEUE_REPO,
104
+ revision="main",
105
+ local_dir=EVAL_REQUESTS_PATH_BACKEND_SYNC,
106
+ repo_type="dataset",
107
+ max_workers=60,
108
+ )
109
+ read_and_write_json_files(EVAL_REQUESTS_PATH_BACKEND_SYNC, EVAL_REQUESTS_PATH_OPEN_LLM)
cli/truefalse-upload-cli.py CHANGED
@@ -5,11 +5,11 @@ import os
5
 
6
  from datasets import load_dataset
7
 
8
- path = 'pminervini/true-false'
9
- folder_path = 'true-false-data/' # Replace with your folder path
10
 
11
  # Search for all .json files in the folder
12
- csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
13
 
14
  ds = load_dataset("csv", data_files={os.path.basename(path).split("_")[0]: path for path in csv_files})
15
  ds.push_to_hub(path)
 
5
 
6
  from datasets import load_dataset
7
 
8
+ path = "pminervini/true-false"
9
+ folder_path = "true-false-data/" # Replace with your folder path
10
 
11
  # Search for all .json files in the folder
12
+ csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
13
 
14
  ds = load_dataset("csv", data_files={os.path.basename(path).split("_")[0]: path for path in csv_files})
15
  ds.push_to_hub(path)
src/backend/envs.py CHANGED
@@ -63,6 +63,6 @@ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
63
  EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
64
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
65
 
66
- DEVICE = "cuda" if torch.cuda.is_available() else 'cpu'
67
 
68
  LIMIT = None # Testing; needs to be None
 
63
  EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
64
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
65
 
66
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
67
 
68
  LIMIT = None # Testing; needs to be None
src/backend/huggingface_generate_until.py CHANGED
@@ -5,7 +5,8 @@ import transformers
5
  from lm_eval.models.huggingface import HFLM
6
  from lm_eval.api.registry import register_model
7
 
8
- @register_model('hf-chat')
 
9
  class HFLMwithChatTemplate(HFLM):
10
  def __init__(self, use_chat_template=True, **kwargs):
11
  super().__init__(**kwargs)
@@ -49,9 +50,7 @@ class HFLMwithChatTemplate(HFLM):
49
  )
50
  if left_truncate_len:
51
  encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
52
- encoding["attention_mask"] = encoding["attention_mask"][
53
- :, -left_truncate_len:
54
- ]
55
  self.tokenizer.padding_side = old_padding_side
56
 
57
- return encoding["input_ids"], encoding["attention_mask"]
 
5
  from lm_eval.models.huggingface import HFLM
6
  from lm_eval.api.registry import register_model
7
 
8
+
9
+ @register_model("hf-chat")
10
  class HFLMwithChatTemplate(HFLM):
11
  def __init__(self, use_chat_template=True, **kwargs):
12
  super().__init__(**kwargs)
 
50
  )
51
  if left_truncate_len:
52
  encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
53
+ encoding["attention_mask"] = encoding["attention_mask"][:, -left_truncate_len:]
 
 
54
  self.tokenizer.padding_side = old_padding_side
55
 
56
+ return encoding["input_ids"], encoding["attention_mask"]
src/backend/manage_requests.py CHANGED
@@ -17,24 +17,27 @@ class EvalRequest:
17
  weight_type: str = "Original"
18
  model_type: str = "" # pretrained, finetuned, with RL
19
  precision: str = "" # float16, bfloat16
20
- base_model: Optional[str] = None # for adapter models
21
- revision: str = "main" # commit
22
- submitted_time: Optional[str] = "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
 
 
23
  model_type: Optional[str] = None
24
  likes: Optional[int] = 0
25
  params: Optional[int] = None
26
  license: Optional[str] = ""
 
27
  def get_model_args(self) -> str:
28
- model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
29
 
30
  if self.precision in ["float16", "float32", "bfloat16"]:
31
  model_args += f",dtype={self.precision}"
32
- # Quantized models need some added config, the install of bits and bytes, etc
33
- #elif self.precision == "8bit":
34
- # model_args += ",load_in_8bit=True"
35
- #elif self.precision == "4bit":
36
- # model_args += ",load_in_4bit=True"
37
- #elif self.precision == "GPTQ":
38
  # A GPTQ model does not need dtype to be specified,
39
  # it will be inferred from the config
40
  pass
@@ -55,8 +58,12 @@ def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str,
55
  with open(json_filepath, "w") as f:
56
  f.write(json.dumps(data))
57
 
58
- api.upload_file(path_or_fileobj=json_filepath, path_in_repo=json_filepath.replace(local_dir, ""),
59
- repo_id=hf_repo, repo_type="dataset")
 
 
 
 
60
 
61
 
62
  def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_download: bool = True) -> list[EvalRequest]:
@@ -68,7 +75,9 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_downloa
68
  `list[EvalRequest]`: a list of model info dicts.
69
  """
70
  if do_download:
71
- my_snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60)
 
 
72
 
73
  json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
74
 
@@ -81,8 +90,8 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_downloa
81
  # breakpoint()
82
  data["json_filepath"] = json_filepath
83
 
84
- if 'job_id' in data:
85
- del data['job_id']
86
 
87
  eval_request = EvalRequest(**data)
88
  eval_requests.append(eval_request)
@@ -90,10 +99,20 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_downloa
90
  return eval_requests
91
 
92
 
93
- def check_completed_evals(api: HfApi, hf_repo: str, local_dir: str, checked_status: str, completed_status: str,
94
- failed_status: str, hf_repo_results: str, local_dir_results: str):
 
 
 
 
 
 
 
 
95
  """Checks if the currently running evals are completed, if yes, update their status on the hub."""
96
- my_snapshot_download(repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60)
 
 
97
 
98
  running_evals = get_eval_requests([checked_status], hf_repo=hf_repo, local_dir=local_dir)
99
 
@@ -109,5 +128,3 @@ def check_completed_evals(api: HfApi, hf_repo: str, local_dir: str, checked_stat
109
  if output_file_exists:
110
  print(f"EXISTS output file exists for {model} setting it to {completed_status}")
111
  set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
112
-
113
-
 
17
  weight_type: str = "Original"
18
  model_type: str = "" # pretrained, finetuned, with RL
19
  precision: str = "" # float16, bfloat16
20
+ base_model: Optional[str] = None # for adapter models
21
+ revision: str = "main" # commit
22
+ submitted_time: Optional[str] = (
23
+ "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
24
+ )
25
  model_type: Optional[str] = None
26
  likes: Optional[int] = 0
27
  params: Optional[int] = None
28
  license: Optional[str] = ""
29
+
30
  def get_model_args(self) -> str:
31
+ model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
32
 
33
  if self.precision in ["float16", "float32", "bfloat16"]:
34
  model_args += f",dtype={self.precision}"
35
+ # Quantized models need some added config, the install of bits and bytes, etc
36
+ # elif self.precision == "8bit":
37
+ # model_args += ",load_in_8bit=True"
38
+ # elif self.precision == "4bit":
39
+ # model_args += ",load_in_4bit=True"
40
+ # elif self.precision == "GPTQ":
41
  # A GPTQ model does not need dtype to be specified,
42
  # it will be inferred from the config
43
  pass
 
58
  with open(json_filepath, "w") as f:
59
  f.write(json.dumps(data))
60
 
61
+ api.upload_file(
62
+ path_or_fileobj=json_filepath,
63
+ path_in_repo=json_filepath.replace(local_dir, ""),
64
+ repo_id=hf_repo,
65
+ repo_type="dataset",
66
+ )
67
 
68
 
69
  def get_eval_requests(job_status: list, local_dir: str, hf_repo: str, do_download: bool = True) -> list[EvalRequest]:
 
75
  `list[EvalRequest]`: a list of model info dicts.
76
  """
77
  if do_download:
78
+ my_snapshot_download(
79
+ repo_id=hf_repo, revision="main", local_dir=local_dir, repo_type="dataset", max_workers=60
80
+ )
81
 
82
  json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
83
 
 
90
  # breakpoint()
91
  data["json_filepath"] = json_filepath
92
 
93
+ if "job_id" in data:
94
+ del data["job_id"]
95
 
96
  eval_request = EvalRequest(**data)
97
  eval_requests.append(eval_request)
 
99
  return eval_requests
100
 
101
 
102
+ def check_completed_evals(
103
+ api: HfApi,
104
+ hf_repo: str,
105
+ local_dir: str,
106
+ checked_status: str,
107
+ completed_status: str,
108
+ failed_status: str,
109
+ hf_repo_results: str,
110
+ local_dir_results: str,
111
+ ):
112
  """Checks if the currently running evals are completed, if yes, update their status on the hub."""
113
+ my_snapshot_download(
114
+ repo_id=hf_repo_results, revision="main", local_dir=local_dir_results, repo_type="dataset", max_workers=60
115
+ )
116
 
117
  running_evals = get_eval_requests([checked_status], hf_repo=hf_repo, local_dir=local_dir)
118
 
 
128
  if output_file_exists:
129
  print(f"EXISTS output file exists for {model} setting it to {completed_status}")
130
  set_eval_request(api, eval_request, completed_status, hf_repo, local_dir)
 
 
src/backend/moe_infinity.py CHANGED
@@ -8,17 +8,18 @@ from typing import List, Tuple, Optional, Union
8
  from lm_eval.models.huggingface import HFLM
9
  from lm_eval.api.registry import register_model
10
 
11
- @register_model('moe-infinity')
 
12
  class MoEHFLM(HFLM):
13
  def __init__(
14
  self,
15
  pretrained: str = "mistralai/Mixtral-8x7B-Instruct-v0.1",
16
  moe_config: dict = None,
17
- offload_path = os.path.expanduser('~'),
18
- device_memory_ratio = 0.75,
19
  use_chat_template=True,
20
  *args,
21
- **kwargs
22
  ):
23
  # Initialize parent class without calling _create_model in the parent's __init__
24
  self.checkpoint = pretrained
@@ -28,7 +29,9 @@ class MoEHFLM(HFLM):
28
  self.use_chat_template = use_chat_template
29
  if "device" in kwargs:
30
  kwargs.pop("device")
31
- super().__init__(*args, **kwargs, pretrained=pretrained, device_map="cuda:0") # Assuming HFLM accepts a 'pretrained' arg and handles it
 
 
32
  # self._create_model()
33
 
34
  def _create_model(self, *args, **kwargs):
@@ -43,7 +46,9 @@ class MoEHFLM(HFLM):
43
  # Update default config with any user-provided config
44
  final_moe_config = {**default_moe_config, **self.moe_config}
45
  # self._model = MoE(self.checkpoint, final_moe_config)
46
- self._model = AutoModelForCausalLM.from_pretrained(self.checkpoint, torch_dtype=torch.float16, device_map="auto")
 
 
47
 
48
  @property
49
  def max_length(self):
@@ -94,9 +99,7 @@ class MoEHFLM(HFLM):
94
  )
95
  if left_truncate_len:
96
  encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
97
- encoding["attention_mask"] = encoding["attention_mask"][
98
- :, -left_truncate_len:
99
- ]
100
  self.tokenizer.padding_side = old_padding_side
101
 
102
  return encoding["input_ids"], encoding["attention_mask"]
 
8
  from lm_eval.models.huggingface import HFLM
9
  from lm_eval.api.registry import register_model
10
 
11
+
12
+ @register_model("moe-infinity")
13
  class MoEHFLM(HFLM):
14
  def __init__(
15
  self,
16
  pretrained: str = "mistralai/Mixtral-8x7B-Instruct-v0.1",
17
  moe_config: dict = None,
18
+ offload_path=os.path.expanduser("~"),
19
+ device_memory_ratio=0.75,
20
  use_chat_template=True,
21
  *args,
22
+ **kwargs,
23
  ):
24
  # Initialize parent class without calling _create_model in the parent's __init__
25
  self.checkpoint = pretrained
 
29
  self.use_chat_template = use_chat_template
30
  if "device" in kwargs:
31
  kwargs.pop("device")
32
+ super().__init__(
33
+ *args, **kwargs, pretrained=pretrained, device_map="cuda:0"
34
+ ) # Assuming HFLM accepts a 'pretrained' arg and handles it
35
  # self._create_model()
36
 
37
  def _create_model(self, *args, **kwargs):
 
46
  # Update default config with any user-provided config
47
  final_moe_config = {**default_moe_config, **self.moe_config}
48
  # self._model = MoE(self.checkpoint, final_moe_config)
49
+ self._model = AutoModelForCausalLM.from_pretrained(
50
+ self.checkpoint, torch_dtype=torch.float16, device_map="auto"
51
+ )
52
 
53
  @property
54
  def max_length(self):
 
99
  )
100
  if left_truncate_len:
101
  encoding["input_ids"] = encoding["input_ids"][:, -left_truncate_len:]
102
+ encoding["attention_mask"] = encoding["attention_mask"][:, -left_truncate_len:]
 
 
103
  self.tokenizer.padding_side = old_padding_side
104
 
105
  return encoding["input_ids"], encoding["attention_mask"]
src/backend/run_eval_suite.py CHANGED
@@ -14,7 +14,17 @@ from src.backend.tasks.selfcheckgpt.task import SelfCheckGPT
14
  from src.backend.huggingface_generate_until import HFLMwithChatTemplate
15
  from src.backend.moe_infinity import MoEHFLM
16
 
17
- def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_size, device, use_cache=None, limit=None, max_nb_samples=100) -> dict:
 
 
 
 
 
 
 
 
 
 
18
  if limit:
19
  print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
20
 
@@ -33,30 +43,34 @@ def run_evaluation(eval_request: EvalRequest, task_names, num_fewshot, batch_siz
33
 
34
  print(f"Selected Tasks: {task_names}")
35
  print(f"Eval Request: {eval_request.get_model_args()}")
36
- print(f"Num Fewshot: {num_fewshot}, Batch Size: {batch_size}, Device: {device}, Use Cache: {use_cache}, Limit: {limit}")
 
 
37
  # hf-chat is implemented to use apply_chat_template
38
- results = evaluator.simple_evaluate(model="moe-infinity", # "hf-causal-experimental", # "hf-causal", hf-chat
39
- model_args=eval_request.get_model_args(),
40
- tasks=task_names,
41
- num_fewshot=num_fewshot,
42
- batch_size=batch_size,
43
- max_batch_size=8,
44
- device=device,
45
- use_cache=use_cache,
46
- limit=limit,
47
- write_out=True,
48
- task_manager=task_manager)
 
 
49
 
50
  results["config"]["model_dtype"] = eval_request.precision
51
  results["config"]["model_name"] = eval_request.model
52
  results["config"]["model_sha"] = eval_request.revision
53
 
54
  if max_nb_samples is not None:
55
- if 'samples' in results:
56
- samples = results['samples']
57
  for task_name in samples.keys():
58
  if len(samples[task_name]) > max_nb_samples:
59
- results['samples'][task_name] = results['samples'][task_name][:max_nb_samples]
60
 
61
  # print(evaluator.make_table(results))
62
 
 
14
  from src.backend.huggingface_generate_until import HFLMwithChatTemplate
15
  from src.backend.moe_infinity import MoEHFLM
16
 
17
+
18
+ def run_evaluation(
19
+ eval_request: EvalRequest,
20
+ task_names,
21
+ num_fewshot,
22
+ batch_size,
23
+ device,
24
+ use_cache=None,
25
+ limit=None,
26
+ max_nb_samples=100,
27
+ ) -> dict:
28
  if limit:
29
  print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")
30
 
 
43
 
44
  print(f"Selected Tasks: {task_names}")
45
  print(f"Eval Request: {eval_request.get_model_args()}")
46
+ print(
47
+ f"Num Fewshot: {num_fewshot}, Batch Size: {batch_size}, Device: {device}, Use Cache: {use_cache}, Limit: {limit}"
48
+ )
49
  # hf-chat is implemented to use apply_chat_template
50
+ results = evaluator.simple_evaluate(
51
+ model="moe-infinity", # "hf-causal-experimental", # "hf-causal", hf-chat
52
+ model_args=eval_request.get_model_args(),
53
+ tasks=task_names,
54
+ num_fewshot=num_fewshot,
55
+ batch_size=batch_size,
56
+ max_batch_size=8,
57
+ device=device,
58
+ use_cache=use_cache,
59
+ limit=limit,
60
+ write_out=True,
61
+ task_manager=task_manager,
62
+ )
63
 
64
  results["config"]["model_dtype"] = eval_request.precision
65
  results["config"]["model_name"] = eval_request.model
66
  results["config"]["model_sha"] = eval_request.revision
67
 
68
  if max_nb_samples is not None:
69
+ if "samples" in results:
70
+ samples = results["samples"]
71
  for task_name in samples.keys():
72
  if len(samples[task_name]) > max_nb_samples:
73
+ results["samples"][task_name] = results["samples"][task_name][:max_nb_samples]
74
 
75
  # print(evaluator.make_table(results))
76
 
src/backend/tasks/cnndm/task.py CHANGED
@@ -1,5 +1,6 @@
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
 
3
  # from lm_eval.api.registry import register_task
4
  from lm_eval.api.metrics import mean
5
 
@@ -66,7 +67,7 @@ class CNNDM(ConfigurableTask):
66
  DATASET_NAME = "3.0.0"
67
 
68
  def __init__(self):
69
- super().__init__(config={'metadata': {'version': self.VERSION}})
70
  self.factkb_tokenizer = None
71
  self.factkb_model = None
72
  self.bert_score = None
@@ -74,12 +75,18 @@ class CNNDM(ConfigurableTask):
74
  def maybe_init_factkb(self):
75
  if self.factkb_tokenizer is None or self.factkb_model is None:
76
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
77
- self.factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True)
78
- self.factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2, device_map="auto")
 
 
 
 
 
79
 
80
  def maybe_init_bertscore(self):
81
  if self.bert_score is None:
82
  from evaluate import load
 
83
  self.bert_score = load("bertscore")
84
 
85
  def has_training_docs(self):
@@ -125,15 +132,7 @@ class CNNDM(ConfigurableTask):
125
  part of the document for `doc`.
126
  """
127
 
128
- return [
129
- Instance(
130
- request_type="generate_until",
131
- doc=doc,
132
- arguments=(ctx, {"until": ["\n"]}),
133
- idx=0,
134
- **kwargs
135
- )
136
- ]
137
 
138
  def process_results(self, doc, results):
139
  completion = results[0]
@@ -157,12 +156,16 @@ class CNNDM(ConfigurableTask):
157
 
158
  self.maybe_init_factkb()
159
  input_factkb = [[completion, document]]
160
- factkb_tokens = self.factkb_tokenizer(input_factkb, return_tensors="pt", padding="max_length", truncation=True).to(self.factkb_model.device)
 
 
161
  factkb_logits = self.factkb_model(**factkb_tokens).logits
162
  factkb_res = torch.softmax(factkb_logits, dim=1)
163
 
164
  self.maybe_init_bertscore()
165
- bert_score_res = self.bert_score.compute(predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en")
 
 
166
 
167
  res = {
168
  "rouge1": rouge1_scores[0],
@@ -171,7 +174,7 @@ class CNNDM(ConfigurableTask):
171
  "factKB": float(factkb_res[0][1]),
172
  "bertscore_precision": float(bert_score_res["precision"][0]),
173
  "bertscore_recall": float(bert_score_res["recall"][0]),
174
- "bertscore_f1": float(bert_score_res["f1"][0])
175
  }
176
 
177
  return res
@@ -182,7 +185,18 @@ class CNNDM(ConfigurableTask):
182
  A dictionary where keys are the names of submetrics and values are
183
  functions that aggregate a list of metrics
184
  """
185
- return {k: mean for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  def higher_is_better(self):
188
  """
@@ -190,5 +204,15 @@ class CNNDM(ConfigurableTask):
190
  A dictionary where keys are the names of submetrics and values are
191
  whether a higher value of the submetric is better
192
  """
193
- return {k: True for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
194
-
 
 
 
 
 
 
 
 
 
 
 
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
3
+
4
  # from lm_eval.api.registry import register_task
5
  from lm_eval.api.metrics import mean
6
 
 
67
  DATASET_NAME = "3.0.0"
68
 
69
  def __init__(self):
70
+ super().__init__(config={"metadata": {"version": self.VERSION}})
71
  self.factkb_tokenizer = None
72
  self.factkb_model = None
73
  self.bert_score = None
 
75
  def maybe_init_factkb(self):
76
  if self.factkb_tokenizer is None or self.factkb_model is None:
77
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
78
+
79
+ self.factkb_tokenizer = AutoTokenizer.from_pretrained(
80
+ "roberta-base", padding="max_length", truncation=True
81
+ )
82
+ self.factkb_model = AutoModelForSequenceClassification.from_pretrained(
83
+ "bunsenfeng/FactKB", num_labels=2, device_map="auto"
84
+ )
85
 
86
  def maybe_init_bertscore(self):
87
  if self.bert_score is None:
88
  from evaluate import load
89
+
90
  self.bert_score = load("bertscore")
91
 
92
  def has_training_docs(self):
 
132
  part of the document for `doc`.
133
  """
134
 
135
+ return [Instance(request_type="generate_until", doc=doc, arguments=(ctx, {"until": ["\n"]}), idx=0, **kwargs)]
 
 
 
 
 
 
 
 
136
 
137
  def process_results(self, doc, results):
138
  completion = results[0]
 
156
 
157
  self.maybe_init_factkb()
158
  input_factkb = [[completion, document]]
159
+ factkb_tokens = self.factkb_tokenizer(
160
+ input_factkb, return_tensors="pt", padding="max_length", truncation=True
161
+ ).to(self.factkb_model.device)
162
  factkb_logits = self.factkb_model(**factkb_tokens).logits
163
  factkb_res = torch.softmax(factkb_logits, dim=1)
164
 
165
  self.maybe_init_bertscore()
166
+ bert_score_res = self.bert_score.compute(
167
+ predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en"
168
+ )
169
 
170
  res = {
171
  "rouge1": rouge1_scores[0],
 
174
  "factKB": float(factkb_res[0][1]),
175
  "bertscore_precision": float(bert_score_res["precision"][0]),
176
  "bertscore_recall": float(bert_score_res["recall"][0]),
177
+ "bertscore_f1": float(bert_score_res["f1"][0]),
178
  }
179
 
180
  return res
 
185
  A dictionary where keys are the names of submetrics and values are
186
  functions that aggregate a list of metrics
187
  """
188
+ return {
189
+ k: mean
190
+ for k in [
191
+ "rouge1",
192
+ "rouge2",
193
+ "rougeL",
194
+ "factKB",
195
+ "bertscore_precision",
196
+ "bertscore_recall",
197
+ "bertscore_f1",
198
+ ]
199
+ }
200
 
201
  def higher_is_better(self):
202
  """
 
204
  A dictionary where keys are the names of submetrics and values are
205
  whether a higher value of the submetric is better
206
  """
207
+ return {
208
+ k: True
209
+ for k in [
210
+ "rouge1",
211
+ "rouge2",
212
+ "rougeL",
213
+ "factKB",
214
+ "bertscore_precision",
215
+ "bertscore_recall",
216
+ "bertscore_f1",
217
+ ]
218
+ }
src/backend/tasks/cnndm/task_v2.py CHANGED
@@ -1,5 +1,6 @@
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
 
3
  # from lm_eval.api.registry import register_task
4
  from lm_eval.api.metrics import mean
5
 
@@ -66,8 +67,12 @@ class CNNDMv2(ConfigurableTask):
66
  DATASET_NAME = "3.0.0"
67
 
68
  def __init__(self):
69
- super().__init__(config={'metadata': {'version': self.VERSION},
70
- 'generation_kwargs': {'do_sample': False, 'temperature': 0.0, 'until': ['\n', '\n\n']}})
 
 
 
 
71
  self.factkb_tokenizer = None
72
  self.factkb_model = None
73
  self.bert_score = None
@@ -75,12 +80,18 @@ class CNNDMv2(ConfigurableTask):
75
  def maybe_init_factkb(self):
76
  if self.factkb_tokenizer is None or self.factkb_model is None:
77
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
78
- self.factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True)
79
- self.factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2, device_map="auto")
 
 
 
 
 
80
 
81
  def maybe_init_bertscore(self):
82
  if self.bert_score is None:
83
  from evaluate import load
 
84
  self.bert_score = load("bertscore")
85
 
86
  def has_training_docs(self):
@@ -134,15 +145,7 @@ class CNNDMv2(ConfigurableTask):
134
  part of the document for `doc`.
135
  """
136
 
137
- return [
138
- Instance(
139
- request_type="generate_until",
140
- doc=doc,
141
- arguments=(ctx, {"until": ["\n"]}),
142
- idx=0,
143
- **kwargs
144
- )
145
- ]
146
 
147
  def process_results(self, doc, results):
148
  completion = results[0]
@@ -166,12 +169,16 @@ class CNNDMv2(ConfigurableTask):
166
 
167
  self.maybe_init_factkb()
168
  input_factkb = [[completion, document]]
169
- factkb_tokens = self.factkb_tokenizer(input_factkb, return_tensors="pt", padding="max_length", truncation=True).to(self.factkb_model.device)
 
 
170
  factkb_logits = self.factkb_model(**factkb_tokens).logits
171
  factkb_res = torch.softmax(factkb_logits, dim=1)
172
 
173
  self.maybe_init_bertscore()
174
- bert_score_res = self.bert_score.compute(predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en")
 
 
175
 
176
  res = {
177
  "rouge1": rouge1_scores[0],
@@ -180,7 +187,7 @@ class CNNDMv2(ConfigurableTask):
180
  "factKB": float(factkb_res[0][1]),
181
  "bertscore_precision": float(bert_score_res["precision"][0]),
182
  "bertscore_recall": float(bert_score_res["recall"][0]),
183
- "bertscore_f1": float(bert_score_res["f1"][0])
184
  }
185
 
186
  return res
@@ -191,7 +198,18 @@ class CNNDMv2(ConfigurableTask):
191
  A dictionary where keys are the names of submetrics and values are
192
  functions that aggregate a list of metrics
193
  """
194
- return {k: mean for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
 
 
 
 
 
 
 
 
 
 
 
195
 
196
  def higher_is_better(self):
197
  """
@@ -199,5 +217,15 @@ class CNNDMv2(ConfigurableTask):
199
  A dictionary where keys are the names of submetrics and values are
200
  whether a higher value of the submetric is better
201
  """
202
- return {k: True for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
203
-
 
 
 
 
 
 
 
 
 
 
 
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
3
+
4
  # from lm_eval.api.registry import register_task
5
  from lm_eval.api.metrics import mean
6
 
 
67
  DATASET_NAME = "3.0.0"
68
 
69
  def __init__(self):
70
+ super().__init__(
71
+ config={
72
+ "metadata": {"version": self.VERSION},
73
+ "generation_kwargs": {"do_sample": False, "temperature": 0.0, "until": ["\n", "\n\n"]},
74
+ }
75
+ )
76
  self.factkb_tokenizer = None
77
  self.factkb_model = None
78
  self.bert_score = None
 
80
  def maybe_init_factkb(self):
81
  if self.factkb_tokenizer is None or self.factkb_model is None:
82
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
83
+
84
+ self.factkb_tokenizer = AutoTokenizer.from_pretrained(
85
+ "roberta-base", padding="max_length", truncation=True
86
+ )
87
+ self.factkb_model = AutoModelForSequenceClassification.from_pretrained(
88
+ "bunsenfeng/FactKB", num_labels=2, device_map="auto"
89
+ )
90
 
91
  def maybe_init_bertscore(self):
92
  if self.bert_score is None:
93
  from evaluate import load
94
+
95
  self.bert_score = load("bertscore")
96
 
97
  def has_training_docs(self):
 
145
  part of the document for `doc`.
146
  """
147
 
148
+ return [Instance(request_type="generate_until", doc=doc, arguments=(ctx, {"until": ["\n"]}), idx=0, **kwargs)]
 
 
 
 
 
 
 
 
149
 
150
  def process_results(self, doc, results):
151
  completion = results[0]
 
169
 
170
  self.maybe_init_factkb()
171
  input_factkb = [[completion, document]]
172
+ factkb_tokens = self.factkb_tokenizer(
173
+ input_factkb, return_tensors="pt", padding="max_length", truncation=True
174
+ ).to(self.factkb_model.device)
175
  factkb_logits = self.factkb_model(**factkb_tokens).logits
176
  factkb_res = torch.softmax(factkb_logits, dim=1)
177
 
178
  self.maybe_init_bertscore()
179
+ bert_score_res = self.bert_score.compute(
180
+ predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en"
181
+ )
182
 
183
  res = {
184
  "rouge1": rouge1_scores[0],
 
187
  "factKB": float(factkb_res[0][1]),
188
  "bertscore_precision": float(bert_score_res["precision"][0]),
189
  "bertscore_recall": float(bert_score_res["recall"][0]),
190
+ "bertscore_f1": float(bert_score_res["f1"][0]),
191
  }
192
 
193
  return res
 
198
  A dictionary where keys are the names of submetrics and values are
199
  functions that aggregate a list of metrics
200
  """
201
+ return {
202
+ k: mean
203
+ for k in [
204
+ "rouge1",
205
+ "rouge2",
206
+ "rougeL",
207
+ "factKB",
208
+ "bertscore_precision",
209
+ "bertscore_recall",
210
+ "bertscore_f1",
211
+ ]
212
+ }
213
 
214
  def higher_is_better(self):
215
  """
 
217
  A dictionary where keys are the names of submetrics and values are
218
  whether a higher value of the submetric is better
219
  """
220
+ return {
221
+ k: True
222
+ for k in [
223
+ "rouge1",
224
+ "rouge2",
225
+ "rougeL",
226
+ "factKB",
227
+ "bertscore_precision",
228
+ "bertscore_recall",
229
+ "bertscore_f1",
230
+ ]
231
+ }
src/backend/tasks/faithdial/utils.py CHANGED
@@ -1,15 +1,16 @@
1
  from typing import List, Union
 
2
  ValueType = Union[str, List[str]]
3
 
4
 
5
  def doc_to_text(doc: dict[str, ValueType]) -> str:
6
- history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc['history'])])
7
  doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["response"]}\n#Hallucinated#:'
8
  return doc_text
9
 
10
 
11
  def doc_to_text_v2(doc: dict[str, ValueType]) -> str:
12
- history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc['history'])])
13
  doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["original_response"]}\n#Hallucinated#:'
14
  return doc_text
15
 
 
1
  from typing import List, Union
2
+
3
  ValueType = Union[str, List[str]]
4
 
5
 
6
  def doc_to_text(doc: dict[str, ValueType]) -> str:
7
+ history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc["history"])])
8
  doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["response"]}\n#Hallucinated#:'
9
  return doc_text
10
 
11
 
12
  def doc_to_text_v2(doc: dict[str, ValueType]) -> str:
13
+ history_str = " ".join([f'[{"Human" if i % 2 == 0 else "Assistant"}] {m}' for i, m in enumerate(doc["history"])])
14
  doc_text = f'#Knowledge#: {doc["knowledge"]}\n#Dialogue History#: {history_str}\n#Response#: {doc["original_response"]}\n#Hallucinated#:'
15
  return doc_text
16
 
src/backend/tasks/halueval/utils.py CHANGED
@@ -83,13 +83,31 @@ You should try your best to determine if the summary contains non-factual or hal
83
 
84
  def doc_to_text_qa(doc: dict[str, str]) -> str:
85
  # prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
86
- doc_text = QA_INSTURCTIONS + "\n\n#Knowledge#: " + doc["knowledge"] + "\n#Question#: " + doc["question"] + "\n#Answer#: " + doc["answer"] + "\n#Your Judgement#:"
 
 
 
 
 
 
 
 
 
87
  return doc_text
88
 
89
 
90
  def doc_to_text_dialogue(doc: dict[str, str]) -> str:
91
  # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
92
- doc_text = DIALOGUE_INSTRUCTIONS + "\n\n#Knowledge#: " + doc["knowledge"] + "\n#Dialogue History#: " + doc["dialogue_history"] + "\n#Response#: " + doc["response"] + "\n#Your Judgement#:"
 
 
 
 
 
 
 
 
 
93
  return doc_text
94
 
95
 
@@ -103,7 +121,7 @@ def doc_to_text_summarization(doc: dict[str, str]) -> str:
103
 
104
 
105
  def doc_to_target(doc: dict[str, str]) -> str:
106
- return doc['hallucination']
107
 
108
 
109
  def compute_metrics(gold_answer: str, prediction: str) -> dict[str, float]:
 
83
 
84
  def doc_to_text_qa(doc: dict[str, str]) -> str:
85
  # prompt = instruction + "\n\n#Question#: " + question + "\n#Answer#: " + answer + "\n#Your Judgement#:"
86
+ doc_text = (
87
+ QA_INSTURCTIONS
88
+ + "\n\n#Knowledge#: "
89
+ + doc["knowledge"]
90
+ + "\n#Question#: "
91
+ + doc["question"]
92
+ + "\n#Answer#: "
93
+ + doc["answer"]
94
+ + "\n#Your Judgement#:"
95
+ )
96
  return doc_text
97
 
98
 
99
  def doc_to_text_dialogue(doc: dict[str, str]) -> str:
100
  # prompt = instruction + "\n\n#Dialogue History#: " + dialog + "\n#Response#: " + response + "\n#Your Judgement#:"
101
+ doc_text = (
102
+ DIALOGUE_INSTRUCTIONS
103
+ + "\n\n#Knowledge#: "
104
+ + doc["knowledge"]
105
+ + "\n#Dialogue History#: "
106
+ + doc["dialogue_history"]
107
+ + "\n#Response#: "
108
+ + doc["response"]
109
+ + "\n#Your Judgement#:"
110
+ )
111
  return doc_text
112
 
113
 
 
121
 
122
 
123
  def doc_to_target(doc: dict[str, str]) -> str:
124
+ return doc["hallucination"]
125
 
126
 
127
  def compute_metrics(gold_answer: str, prediction: str) -> dict[str, float]:
src/backend/tasks/selfcheckgpt/task.py CHANGED
@@ -3,6 +3,7 @@ from typing import Union, List
3
 
4
  from lm_eval.api.task import ConfigurableTask
5
  from lm_eval.api.instance import Instance
 
6
  # from lm_eval.api.registry import register_task
7
  from lm_eval.api.metrics import mean
8
 
@@ -17,26 +18,31 @@ class SelfCheckGPT(ConfigurableTask):
17
  VERSION = 0.0
18
  DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
19
  DATASET_NAME = None
20
- OUTPUT_TYPE = 'generate_until'
21
 
22
  def __init__(self):
23
- super().__init__(config={'metadata': {'version': self.VERSION}})
24
  # these end tokens are hard coded because of the current limitaion of the llm-eval.
25
  self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
26
  self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
27
- self.generation_kwargs_sampling = {"temperature": 0.99, "do_sample": True, "until": ["\n\n", "<unk>", "<|im_end|>", "</s>"], "max_length": 512}
28
-
29
- self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNLI')
30
- self.selfcheckgpt_device = os.environ.get('SELFCHECKGPTDEVICE', DEVICE)
 
 
 
 
 
31
  self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
32
 
33
- if self.selfcheckgpt_type == 'SelfCheckNgram':
34
  self.selfcheckgpt = SelfCheckNgram(n=1)
35
- elif self.selfcheckgpt_type == 'SelfCheckBERTScore':
36
  self.selfcheckgpt = SelfCheckBERTScore(rescale_with_baseline=True)
37
- elif self.selfcheckgpt_type == 'SelfCheckMQAG':
38
  self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
39
- elif self.selfcheckgpt_type == 'SelfCheckNLI':
40
  self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
41
  self.SelfCheckNLI_error_cnt = 0
42
 
@@ -53,10 +59,10 @@ class SelfCheckGPT(ConfigurableTask):
53
  return self.dataset["evaluation"]
54
 
55
  def doc_to_text(self, doc):
56
- if not hasattr(self, 'selfcheckgpt_nlp'):
57
  self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
58
 
59
- sentences = [x.text.strip() for x in self.selfcheckgpt_nlp(doc['wiki_bio_text']).sents]
60
  if len(sentences) < 2:
61
  raise ValueError("This wikipedia passage is too short for self-consistency check: {sentences}")
62
  # disscussed with Potsawee
@@ -65,18 +71,19 @@ class SelfCheckGPT(ConfigurableTask):
65
  return doc_text
66
 
67
  def doc_to_target(self, doc):
68
- answer = doc['wiki_bio_text']
69
  return answer
70
 
71
  def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
72
  arguments = (ctx, self.generation_kwargs)
73
  request_list = [
74
- Instance(request_type='generate_until', doc=doc, arguments=arguments, idx=0, **kwargs),
75
  ]
76
  sampling_arguments = (ctx, self.generation_kwargs_sampling)
77
- request_list.extend([
78
- Instance(request_type='generate_until', doc=doc, arguments=sampling_arguments, idx=idx, **kwargs)
79
- for idx in range(1, self.generation_kwargs_sampling_number+1)
 
80
  ]
81
  )
82
  return request_list
@@ -88,48 +95,53 @@ class SelfCheckGPT(ConfigurableTask):
88
 
89
  sentences = self.selfcheckgpt_nlp(response_temperature_0)
90
  sentences = [sent.text.strip() for sent in sentences.sents]
91
- if self.selfcheckgpt_type == 'SelfCheckNgram':
92
- selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, passage=response_temperature_0, sampled_passages=other_responses)
 
 
93
  return {
94
- 'avg-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_neg_logprob'],
95
- 'max-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_max_neg_logprob']
96
  }
97
 
98
- elif self.selfcheckgpt_type == 'SelfCheckBERTScore':
99
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
100
- elif self.selfcheckgpt_type == 'SelfCheckMQAG':
101
  selfcheckgpt_scores = self.selfcheckgpt.predict(
102
  sentences=sentences,
103
  passage=response_temperature_0,
104
  sampled_passages=other_responses,
105
- num_questions_per_sent=5, # number of questions to be drawn
106
- scoring_method='bayes_with_alpha', # options = 'counting', 'bayes', 'bayes_with_alpha'
107
- beta1=0.8, beta2=0.8) # additional params depending on scoring_method
108
- elif self.selfcheckgpt_type == 'SelfCheckNLI':
 
 
109
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
110
 
111
  if len(selfcheckgpt_scores) < 2:
112
  # at least two sentences
113
  self.SelfCheckNLI_error_cnt += 1
114
- result = {
115
- 'avg-selfcheckgpt': 0.0,
116
- 'max-selfcheckgpt': 0.0
117
- }
118
 
119
  else:
120
- threshold = 0.7 # https://huggingface.co/blog/dhuynh95/automatic-hallucination-detection
121
  # passage is hallucianted if one sentence is hallucinated. It's very strict.
122
  selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
123
  # passage is hallucianted if average score of all sentences is hallucinated.
124
- selfcheckgpt_scores_avg = 0.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 1.0
125
- result = {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
 
 
126
 
127
  return result
128
 
129
- selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
 
 
130
  selfcheckgpt_scores_max = max(selfcheckgpt_scores)
131
 
132
- return {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}
133
 
134
  def aggregation(self):
135
  """
 
3
 
4
  from lm_eval.api.task import ConfigurableTask
5
  from lm_eval.api.instance import Instance
6
+
7
  # from lm_eval.api.registry import register_task
8
  from lm_eval.api.metrics import mean
9
 
 
18
  VERSION = 0.0
19
  DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
20
  DATASET_NAME = None
21
+ OUTPUT_TYPE = "generate_until"
22
 
23
  def __init__(self):
24
+ super().__init__(config={"metadata": {"version": self.VERSION}})
25
  # these end tokens are hard coded because of the current limitaion of the llm-eval.
26
  self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
27
  self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
28
+ self.generation_kwargs_sampling = {
29
+ "temperature": 0.99,
30
+ "do_sample": True,
31
+ "until": ["\n\n", "<unk>", "<|im_end|>", "</s>"],
32
+ "max_length": 512,
33
+ }
34
+
35
+ self.selfcheckgpt_type = os.environ.get("SELFCHECKGPTTYPE", "SelfCheckNLI")
36
+ self.selfcheckgpt_device = os.environ.get("SELFCHECKGPTDEVICE", DEVICE)
37
  self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
38
 
39
+ if self.selfcheckgpt_type == "SelfCheckNgram":
40
  self.selfcheckgpt = SelfCheckNgram(n=1)
41
+ elif self.selfcheckgpt_type == "SelfCheckBERTScore":
42
  self.selfcheckgpt = SelfCheckBERTScore(rescale_with_baseline=True)
43
+ elif self.selfcheckgpt_type == "SelfCheckMQAG":
44
  self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
45
+ elif self.selfcheckgpt_type == "SelfCheckNLI":
46
  self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
47
  self.SelfCheckNLI_error_cnt = 0
48
 
 
59
  return self.dataset["evaluation"]
60
 
61
  def doc_to_text(self, doc):
62
+ if not hasattr(self, "selfcheckgpt_nlp"):
63
  self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")
64
 
65
+ sentences = [x.text.strip() for x in self.selfcheckgpt_nlp(doc["wiki_bio_text"]).sents]
66
  if len(sentences) < 2:
67
  raise ValueError("This wikipedia passage is too short for self-consistency check: {sentences}")
68
  # disscussed with Potsawee
 
71
  return doc_text
72
 
73
  def doc_to_target(self, doc):
74
+ answer = doc["wiki_bio_text"]
75
  return answer
76
 
77
  def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
78
  arguments = (ctx, self.generation_kwargs)
79
  request_list = [
80
+ Instance(request_type="generate_until", doc=doc, arguments=arguments, idx=0, **kwargs),
81
  ]
82
  sampling_arguments = (ctx, self.generation_kwargs_sampling)
83
+ request_list.extend(
84
+ [
85
+ Instance(request_type="generate_until", doc=doc, arguments=sampling_arguments, idx=idx, **kwargs)
86
+ for idx in range(1, self.generation_kwargs_sampling_number + 1)
87
  ]
88
  )
89
  return request_list
 
95
 
96
  sentences = self.selfcheckgpt_nlp(response_temperature_0)
97
  sentences = [sent.text.strip() for sent in sentences.sents]
98
+ if self.selfcheckgpt_type == "SelfCheckNgram":
99
+ selfcheckgpt_scores = self.selfcheckgpt.predict(
100
+ sentences=sentences, passage=response_temperature_0, sampled_passages=other_responses
101
+ )
102
  return {
103
+ "avg-selfcheckgpt": selfcheckgpt_scores["doc_level"]["avg_neg_logprob"],
104
+ "max-selfcheckgpt": selfcheckgpt_scores["doc_level"]["avg_max_neg_logprob"],
105
  }
106
 
107
+ elif self.selfcheckgpt_type == "SelfCheckBERTScore":
108
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
109
+ elif self.selfcheckgpt_type == "SelfCheckMQAG":
110
  selfcheckgpt_scores = self.selfcheckgpt.predict(
111
  sentences=sentences,
112
  passage=response_temperature_0,
113
  sampled_passages=other_responses,
114
+ num_questions_per_sent=5, # number of questions to be drawn
115
+ scoring_method="bayes_with_alpha", # options = 'counting', 'bayes', 'bayes_with_alpha'
116
+ beta1=0.8,
117
+ beta2=0.8,
118
+ ) # additional params depending on scoring_method
119
+ elif self.selfcheckgpt_type == "SelfCheckNLI":
120
  selfcheckgpt_scores = self.selfcheckgpt.predict(sentences=sentences, sampled_passages=other_responses)
121
 
122
  if len(selfcheckgpt_scores) < 2:
123
  # at least two sentences
124
  self.SelfCheckNLI_error_cnt += 1
125
+ result = {"avg-selfcheckgpt": 0.0, "max-selfcheckgpt": 0.0}
 
 
 
126
 
127
  else:
128
+ threshold = 0.7 # https://huggingface.co/blog/dhuynh95/automatic-hallucination-detection
129
  # passage is hallucianted if one sentence is hallucinated. It's very strict.
130
  selfcheckgpt_scores_max = 0.0 if max(selfcheckgpt_scores) > threshold else 1.0
131
  # passage is hallucianted if average score of all sentences is hallucinated.
132
+ selfcheckgpt_scores_avg = (
133
+ 0.0 if sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) > threshold else 1.0
134
+ )
135
+ result = {"avg-selfcheckgpt": selfcheckgpt_scores_avg, "max-selfcheckgpt": selfcheckgpt_scores_max}
136
 
137
  return result
138
 
139
+ selfcheckgpt_scores_avg = (
140
+ sum(selfcheckgpt_scores) / len(selfcheckgpt_scores) if len(selfcheckgpt_scores) > 0 else 0
141
+ )
142
  selfcheckgpt_scores_max = max(selfcheckgpt_scores)
143
 
144
+ return {"avg-selfcheckgpt": selfcheckgpt_scores_avg, "max-selfcheckgpt": selfcheckgpt_scores_max}
145
 
146
  def aggregation(self):
147
  """
src/backend/tasks/xsum/task.py CHANGED
@@ -1,5 +1,6 @@
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
 
3
  # from lm_eval.api.registry import register_task
4
  from lm_eval.api.metrics import mean
5
 
@@ -18,8 +19,16 @@ def bleu(refs, preds):
18
  :param preds:
19
  A `list` of predicted `str`s.
20
  """
21
- score = sacrebleu.corpus_bleu(preds, refs, smooth_method="exp", smooth_value=0.0, force=False,
22
- lowercase=False, tokenize="intl", use_effective_order=False).score
 
 
 
 
 
 
 
 
23
  return score
24
 
25
 
@@ -58,7 +67,7 @@ class XSum(ConfigurableTask):
58
  DATASET_NAME = None
59
 
60
  def __init__(self):
61
- super().__init__(config={'metadata': {'version': self.VERSION}})
62
  self.factkb_tokenizer = None
63
  self.factkb_model = None
64
  self.bert_score = None
@@ -66,12 +75,18 @@ class XSum(ConfigurableTask):
66
  def maybe_init_factkb(self):
67
  if self.factkb_tokenizer is None or self.factkb_model is None:
68
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
69
- self.factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True)
70
- self.factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2, device_map="auto")
 
 
 
 
 
71
 
72
  def maybe_init_bertscore(self):
73
  if self.bert_score is None:
74
  from evaluate import load
 
75
  self.bert_score = load("bertscore")
76
 
77
  def has_training_docs(self):
@@ -124,7 +139,7 @@ class XSum(ConfigurableTask):
124
  # arguments=(ctx, {"until": ["\n", "."]}),
125
  arguments=(ctx, {"until": ["\n"]}),
126
  idx=0,
127
- **kwargs
128
  )
129
  ]
130
 
@@ -150,12 +165,16 @@ class XSum(ConfigurableTask):
150
 
151
  self.maybe_init_factkb()
152
  input_factkb = [[completion, document]]
153
- factkb_tokens = self.factkb_tokenizer(input_factkb, return_tensors="pt", padding="max_length", truncation=True).to(self.factkb_model.device)
 
 
154
  factkb_logits = self.factkb_model(**factkb_tokens).logits
155
  factkb_res = torch.softmax(factkb_logits, dim=1)
156
 
157
  self.maybe_init_bertscore()
158
- bert_score_res = self.bert_score.compute(predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en")
 
 
159
 
160
  res = {
161
  "rouge1": rouge1_scores[0],
@@ -177,7 +196,18 @@ class XSum(ConfigurableTask):
177
  A dictionary where keys are the names of submetrics and values are
178
  functions that aggregate a list of metrics
179
  """
180
- return {k: mean for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
 
 
 
 
 
 
 
 
 
 
 
181
 
182
  def higher_is_better(self):
183
  """
@@ -185,4 +215,15 @@ class XSum(ConfigurableTask):
185
  A dictionary where keys are the names of submetrics and values are
186
  whether a higher value of the submetric is better
187
  """
188
- return {k: True for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
 
 
 
 
 
 
 
 
 
 
 
 
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
3
+
4
  # from lm_eval.api.registry import register_task
5
  from lm_eval.api.metrics import mean
6
 
 
19
  :param preds:
20
  A `list` of predicted `str`s.
21
  """
22
+ score = sacrebleu.corpus_bleu(
23
+ preds,
24
+ refs,
25
+ smooth_method="exp",
26
+ smooth_value=0.0,
27
+ force=False,
28
+ lowercase=False,
29
+ tokenize="intl",
30
+ use_effective_order=False,
31
+ ).score
32
  return score
33
 
34
 
 
67
  DATASET_NAME = None
68
 
69
  def __init__(self):
70
+ super().__init__(config={"metadata": {"version": self.VERSION}})
71
  self.factkb_tokenizer = None
72
  self.factkb_model = None
73
  self.bert_score = None
 
75
  def maybe_init_factkb(self):
76
  if self.factkb_tokenizer is None or self.factkb_model is None:
77
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
78
+
79
+ self.factkb_tokenizer = AutoTokenizer.from_pretrained(
80
+ "roberta-base", padding="max_length", truncation=True
81
+ )
82
+ self.factkb_model = AutoModelForSequenceClassification.from_pretrained(
83
+ "bunsenfeng/FactKB", num_labels=2, device_map="auto"
84
+ )
85
 
86
  def maybe_init_bertscore(self):
87
  if self.bert_score is None:
88
  from evaluate import load
89
+
90
  self.bert_score = load("bertscore")
91
 
92
  def has_training_docs(self):
 
139
  # arguments=(ctx, {"until": ["\n", "."]}),
140
  arguments=(ctx, {"until": ["\n"]}),
141
  idx=0,
142
+ **kwargs,
143
  )
144
  ]
145
 
 
165
 
166
  self.maybe_init_factkb()
167
  input_factkb = [[completion, document]]
168
+ factkb_tokens = self.factkb_tokenizer(
169
+ input_factkb, return_tensors="pt", padding="max_length", truncation=True
170
+ ).to(self.factkb_model.device)
171
  factkb_logits = self.factkb_model(**factkb_tokens).logits
172
  factkb_res = torch.softmax(factkb_logits, dim=1)
173
 
174
  self.maybe_init_bertscore()
175
+ bert_score_res = self.bert_score.compute(
176
+ predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en"
177
+ )
178
 
179
  res = {
180
  "rouge1": rouge1_scores[0],
 
196
  A dictionary where keys are the names of submetrics and values are
197
  functions that aggregate a list of metrics
198
  """
199
+ return {
200
+ k: mean
201
+ for k in [
202
+ "rouge1",
203
+ "rouge2",
204
+ "rougeL",
205
+ "factKB",
206
+ "bertscore_precision",
207
+ "bertscore_recall",
208
+ "bertscore_f1",
209
+ ]
210
+ }
211
 
212
  def higher_is_better(self):
213
  """
 
215
  A dictionary where keys are the names of submetrics and values are
216
  whether a higher value of the submetric is better
217
  """
218
+ return {
219
+ k: True
220
+ for k in [
221
+ "rouge1",
222
+ "rouge2",
223
+ "rougeL",
224
+ "factKB",
225
+ "bertscore_precision",
226
+ "bertscore_recall",
227
+ "bertscore_f1",
228
+ ]
229
+ }
src/backend/tasks/xsum/task_v2.py CHANGED
@@ -1,5 +1,6 @@
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
 
3
  # from lm_eval.api.registry import register_task
4
  from lm_eval.api.metrics import mean
5
 
@@ -18,8 +19,16 @@ def bleu(refs, preds):
18
  :param preds:
19
  A `list` of predicted `str`s.
20
  """
21
- score = sacrebleu.corpus_bleu(preds, refs, smooth_method="exp", smooth_value=0.0, force=False,
22
- lowercase=False, tokenize="intl", use_effective_order=False).score
 
 
 
 
 
 
 
 
23
  return score
24
 
25
 
@@ -59,8 +68,12 @@ class XSumv2(ConfigurableTask):
59
 
60
  def __init__(self):
61
  # breakpoint()
62
- super().__init__(config={'metadata': {'version': self.VERSION},
63
- 'generation_kwargs': {'do_sample': False, 'temperature': 0.0, 'until': ['\n', '\n\n']}})
 
 
 
 
64
  self.factkb_tokenizer = None
65
  self.factkb_model = None
66
  self.bert_score = None
@@ -68,12 +81,18 @@ class XSumv2(ConfigurableTask):
68
  def maybe_init_factkb(self):
69
  if self.factkb_tokenizer is None or self.factkb_model is None:
70
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
71
- self.factkb_tokenizer = AutoTokenizer.from_pretrained("roberta-base", padding="max_length", truncation=True)
72
- self.factkb_model = AutoModelForSequenceClassification.from_pretrained("bunsenfeng/FactKB", num_labels=2, device_map="auto")
 
 
 
 
 
73
 
74
  def maybe_init_bertscore(self):
75
  if self.bert_score is None:
76
  from evaluate import load
 
77
  self.bert_score = load("bertscore")
78
 
79
  def has_training_docs(self):
@@ -129,7 +148,7 @@ class XSumv2(ConfigurableTask):
129
  # arguments=(ctx, {"until": ["\n", "."]}),
130
  arguments=(ctx, {"until": ["\n"]}),
131
  idx=0,
132
- **kwargs
133
  )
134
  ]
135
 
@@ -155,12 +174,16 @@ class XSumv2(ConfigurableTask):
155
 
156
  self.maybe_init_factkb()
157
  input_factkb = [[completion, document]]
158
- factkb_tokens = self.factkb_tokenizer(input_factkb, return_tensors="pt", padding="max_length", truncation=True).to(self.factkb_model.device)
 
 
159
  factkb_logits = self.factkb_model(**factkb_tokens).logits
160
  factkb_res = torch.softmax(factkb_logits, dim=1)
161
 
162
  self.maybe_init_bertscore()
163
- bert_score_res = self.bert_score.compute(predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en")
 
 
164
 
165
  res = {
166
  "rouge1": rouge1_scores[0],
@@ -182,7 +205,18 @@ class XSumv2(ConfigurableTask):
182
  A dictionary where keys are the names of submetrics and values are
183
  functions that aggregate a list of metrics
184
  """
185
- return {k: mean for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  def higher_is_better(self):
188
  """
@@ -190,4 +224,15 @@ class XSumv2(ConfigurableTask):
190
  A dictionary where keys are the names of submetrics and values are
191
  whether a higher value of the submetric is better
192
  """
193
- return {k: True for k in ["rouge1", "rouge2", "rougeL", "factKB", "bertscore_precision", "bertscore_recall", "bertscore_f1"]}
 
 
 
 
 
 
 
 
 
 
 
 
1
  from lm_eval.api.task import ConfigurableTask
2
  from lm_eval.api.instance import Instance
3
+
4
  # from lm_eval.api.registry import register_task
5
  from lm_eval.api.metrics import mean
6
 
 
19
  :param preds:
20
  A `list` of predicted `str`s.
21
  """
22
+ score = sacrebleu.corpus_bleu(
23
+ preds,
24
+ refs,
25
+ smooth_method="exp",
26
+ smooth_value=0.0,
27
+ force=False,
28
+ lowercase=False,
29
+ tokenize="intl",
30
+ use_effective_order=False,
31
+ ).score
32
  return score
33
 
34
 
 
68
 
69
  def __init__(self):
70
  # breakpoint()
71
+ super().__init__(
72
+ config={
73
+ "metadata": {"version": self.VERSION},
74
+ "generation_kwargs": {"do_sample": False, "temperature": 0.0, "until": ["\n", "\n\n"]},
75
+ }
76
+ )
77
  self.factkb_tokenizer = None
78
  self.factkb_model = None
79
  self.bert_score = None
 
81
  def maybe_init_factkb(self):
82
  if self.factkb_tokenizer is None or self.factkb_model is None:
83
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
84
+
85
+ self.factkb_tokenizer = AutoTokenizer.from_pretrained(
86
+ "roberta-base", padding="max_length", truncation=True
87
+ )
88
+ self.factkb_model = AutoModelForSequenceClassification.from_pretrained(
89
+ "bunsenfeng/FactKB", num_labels=2, device_map="auto"
90
+ )
91
 
92
  def maybe_init_bertscore(self):
93
  if self.bert_score is None:
94
  from evaluate import load
95
+
96
  self.bert_score = load("bertscore")
97
 
98
  def has_training_docs(self):
 
148
  # arguments=(ctx, {"until": ["\n", "."]}),
149
  arguments=(ctx, {"until": ["\n"]}),
150
  idx=0,
151
+ **kwargs,
152
  )
153
  ]
154
 
 
174
 
175
  self.maybe_init_factkb()
176
  input_factkb = [[completion, document]]
177
+ factkb_tokens = self.factkb_tokenizer(
178
+ input_factkb, return_tensors="pt", padding="max_length", truncation=True
179
+ ).to(self.factkb_model.device)
180
  factkb_logits = self.factkb_model(**factkb_tokens).logits
181
  factkb_res = torch.softmax(factkb_logits, dim=1)
182
 
183
  self.maybe_init_bertscore()
184
+ bert_score_res = self.bert_score.compute(
185
+ predictions=[completion], references=[gold_summary], model_type="microsoft/deberta-xlarge-mnli", lang="en"
186
+ )
187
 
188
  res = {
189
  "rouge1": rouge1_scores[0],
 
205
  A dictionary where keys are the names of submetrics and values are
206
  functions that aggregate a list of metrics
207
  """
208
+ return {
209
+ k: mean
210
+ for k in [
211
+ "rouge1",
212
+ "rouge2",
213
+ "rougeL",
214
+ "factKB",
215
+ "bertscore_precision",
216
+ "bertscore_recall",
217
+ "bertscore_f1",
218
+ ]
219
+ }
220
 
221
  def higher_is_better(self):
222
  """
 
224
  A dictionary where keys are the names of submetrics and values are
225
  whether a higher value of the submetric is better
226
  """
227
+ return {
228
+ k: True
229
+ for k in [
230
+ "rouge1",
231
+ "rouge2",
232
+ "rougeL",
233
+ "factKB",
234
+ "bertscore_precision",
235
+ "bertscore_recall",
236
+ "bertscore_f1",
237
+ ]
238
+ }
src/browse.py CHANGED
@@ -32,6 +32,7 @@ import socket
32
  import subprocess
33
  import sys
34
  import webbrowser
 
35
  if sys.version_info >= (3, 2):
36
  from html import escape
37
  else:
@@ -42,7 +43,7 @@ except ImportError:
42
  from urllib2 import unquote
43
  from collections import namedtuple
44
 
45
- Node = namedtuple('Node', ['inputs', 'rule', 'target', 'outputs'])
46
 
47
  # Ideally we'd allow you to navigate to a build edge or a build node,
48
  # with appropriate views for each. But there's no way to *name* a build
@@ -57,16 +58,19 @@ Node = namedtuple('Node', ['inputs', 'rule', 'target', 'outputs'])
57
  # This means there's no single view that shows you all inputs and outputs
58
  # of an edge. But I think it's less confusing than alternatives.
59
 
 
60
  def match_strip(line, prefix):
61
  if not line.startswith(prefix):
62
  return (False, line)
63
- return (True, line[len(prefix):])
 
64
 
65
  def html_escape(text):
66
  return escape(text, quote=True)
67
 
 
68
  def parse(text):
69
- lines = iter(text.split('\n'))
70
 
71
  target = None
72
  rule = None
@@ -77,33 +81,35 @@ def parse(text):
77
  target = next(lines)[:-1] # strip trailing colon
78
 
79
  line = next(lines)
80
- (match, rule) = match_strip(line, ' input: ')
81
  if match:
82
- (match, line) = match_strip(next(lines), ' ')
83
  while match:
84
  type = None
85
- (match, line) = match_strip(line, '| ')
86
  if match:
87
- type = 'implicit'
88
- (match, line) = match_strip(line, '|| ')
89
  if match:
90
- type = 'order-only'
91
  inputs.append((line, type))
92
- (match, line) = match_strip(next(lines), ' ')
93
 
94
- match, _ = match_strip(line, ' outputs:')
95
  if match:
96
- (match, line) = match_strip(next(lines), ' ')
97
  while match:
98
  outputs.append(line)
99
- (match, line) = match_strip(next(lines), ' ')
100
  except StopIteration:
101
  pass
102
 
103
  return Node(inputs, rule, target, outputs)
104
 
 
105
  def create_page(body):
106
- return '''<!DOCTYPE html>
 
107
  <style>
108
  body {
109
  font-family: sans;
@@ -128,52 +134,55 @@ tt {
128
  -webkit-columns: auto 2;
129
  }
130
  </style>
131
- ''' + body
 
 
 
132
 
133
  def generate_html(node):
134
- document = ['<h1><tt>%s</tt></h1>' % html_escape(node.target)]
135
 
136
  if node.inputs:
137
- document.append('<h2>target is built using rule <tt>%s</tt> of</h2>' %
138
- html_escape(node.rule))
139
  if len(node.inputs) > 0:
140
- document.append('<div class=filelist>')
141
  for input, type in sorted(node.inputs):
142
- extra = ''
143
  if type:
144
- extra = ' (%s)' % html_escape(type)
145
- document.append('<tt><a href="?%s">%s</a>%s</tt><br>' %
146
- (html_escape(input), html_escape(input), extra))
147
- document.append('</div>')
 
148
 
149
  if node.outputs:
150
- document.append('<h2>dependent edges build:</h2>')
151
- document.append('<div class=filelist>')
152
  for output in sorted(node.outputs):
153
- document.append('<tt><a href="?%s">%s</a></tt><br>' %
154
- (html_escape(output), html_escape(output)))
155
- document.append('</div>')
 
156
 
157
- return '\n'.join(document)
158
 
159
  def ninja_dump(target):
160
- cmd = [args.ninja_command, '-f', args.f, '-t', 'query', target]
161
- proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
162
- universal_newlines=True)
163
  return proc.communicate() + (proc.returncode,)
164
 
 
165
  class RequestHandler(httpserver.BaseHTTPRequestHandler):
166
  def do_GET(self):
167
- assert self.path[0] == '/'
168
  target = unquote(self.path[1:])
169
 
170
- if target == '':
171
  self.send_response(302)
172
- self.send_header('Location', '?' + args.initial_target)
173
  self.end_headers()
174
  return
175
 
176
- if not target.startswith('?'):
177
  self.send_response(404)
178
  self.end_headers()
179
  return
@@ -184,48 +193,45 @@ class RequestHandler(httpserver.BaseHTTPRequestHandler):
184
  page_body = generate_html(parse(ninja_output.strip()))
185
  else:
186
  # Relay ninja's error message.
187
- page_body = '<h1><tt>%s</tt></h1>' % html_escape(ninja_error)
188
 
189
  self.send_response(200)
190
  self.end_headers()
191
- self.wfile.write(create_page(page_body).encode('utf-8'))
192
 
193
  def log_message(self, format, *args):
194
  pass # Swallow console spam.
195
 
196
- parser = argparse.ArgumentParser(prog='ninja -t browse')
197
- parser.add_argument('--port', '-p', default=8000, type=int,
198
- help='Port number to use (default %(default)d)')
199
- parser.add_argument('--hostname', '-a', default='localhost', type=str,
200
- help='Hostname to bind to (default %(default)s)')
201
- parser.add_argument('--no-browser', action='store_true',
202
- help='Do not open a webbrowser on startup.')
203
-
204
- parser.add_argument('--ninja-command', default='ninja',
205
- help='Path to ninja binary (default %(default)s)')
206
- parser.add_argument('-f', default='build.ninja',
207
- help='Path to build.ninja file (default %(default)s)')
208
- parser.add_argument('initial_target', default='all', nargs='?',
209
- help='Initial target to show (default %(default)s)')
210
 
211
  class HTTPServer(socketserver.ThreadingMixIn, httpserver.HTTPServer):
212
  # terminate server immediately when Python exits.
213
  daemon_threads = True
214
 
 
215
  args = parser.parse_args()
216
  port = args.port
217
  hostname = args.hostname
218
- httpd = HTTPServer((hostname,port), RequestHandler)
219
  try:
220
  if hostname == "":
221
  hostname = socket.gethostname()
222
- print('Web server running on %s:%d, ctl-C to abort...' % (hostname,port) )
223
- print('Web server pid %d' % os.getpid(), file=sys.stderr )
224
  if not args.no_browser:
225
- webbrowser.open_new('http://%s:%s' % (hostname, port) )
226
  httpd.serve_forever()
227
  except KeyboardInterrupt:
228
  print()
229
  pass # Swallow console spam.
230
-
231
-
 
32
  import subprocess
33
  import sys
34
  import webbrowser
35
+
36
  if sys.version_info >= (3, 2):
37
  from html import escape
38
  else:
 
43
  from urllib2 import unquote
44
  from collections import namedtuple
45
 
46
+ Node = namedtuple("Node", ["inputs", "rule", "target", "outputs"])
47
 
48
  # Ideally we'd allow you to navigate to a build edge or a build node,
49
  # with appropriate views for each. But there's no way to *name* a build
 
58
  # This means there's no single view that shows you all inputs and outputs
59
  # of an edge. But I think it's less confusing than alternatives.
60
 
61
+
62
  def match_strip(line, prefix):
63
  if not line.startswith(prefix):
64
  return (False, line)
65
+ return (True, line[len(prefix) :])
66
+
67
 
68
  def html_escape(text):
69
  return escape(text, quote=True)
70
 
71
+
72
  def parse(text):
73
+ lines = iter(text.split("\n"))
74
 
75
  target = None
76
  rule = None
 
81
  target = next(lines)[:-1] # strip trailing colon
82
 
83
  line = next(lines)
84
+ (match, rule) = match_strip(line, " input: ")
85
  if match:
86
+ (match, line) = match_strip(next(lines), " ")
87
  while match:
88
  type = None
89
+ (match, line) = match_strip(line, "| ")
90
  if match:
91
+ type = "implicit"
92
+ (match, line) = match_strip(line, "|| ")
93
  if match:
94
+ type = "order-only"
95
  inputs.append((line, type))
96
+ (match, line) = match_strip(next(lines), " ")
97
 
98
+ match, _ = match_strip(line, " outputs:")
99
  if match:
100
+ (match, line) = match_strip(next(lines), " ")
101
  while match:
102
  outputs.append(line)
103
+ (match, line) = match_strip(next(lines), " ")
104
  except StopIteration:
105
  pass
106
 
107
  return Node(inputs, rule, target, outputs)
108
 
109
+
110
  def create_page(body):
111
+ return (
112
+ """<!DOCTYPE html>
113
  <style>
114
  body {
115
  font-family: sans;
 
134
  -webkit-columns: auto 2;
135
  }
136
  </style>
137
+ """
138
+ + body
139
+ )
140
+
141
 
142
  def generate_html(node):
143
+ document = ["<h1><tt>%s</tt></h1>" % html_escape(node.target)]
144
 
145
  if node.inputs:
146
+ document.append("<h2>target is built using rule <tt>%s</tt> of</h2>" % html_escape(node.rule))
 
147
  if len(node.inputs) > 0:
148
+ document.append("<div class=filelist>")
149
  for input, type in sorted(node.inputs):
150
+ extra = ""
151
  if type:
152
+ extra = " (%s)" % html_escape(type)
153
+ document.append(
154
+ '<tt><a href="?%s">%s</a>%s</tt><br>' % (html_escape(input), html_escape(input), extra)
155
+ )
156
+ document.append("</div>")
157
 
158
  if node.outputs:
159
+ document.append("<h2>dependent edges build:</h2>")
160
+ document.append("<div class=filelist>")
161
  for output in sorted(node.outputs):
162
+ document.append('<tt><a href="?%s">%s</a></tt><br>' % (html_escape(output), html_escape(output)))
163
+ document.append("</div>")
164
+
165
+ return "\n".join(document)
166
 
 
167
 
168
  def ninja_dump(target):
169
+ cmd = [args.ninja_command, "-f", args.f, "-t", "query", target]
170
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
 
171
  return proc.communicate() + (proc.returncode,)
172
 
173
+
174
  class RequestHandler(httpserver.BaseHTTPRequestHandler):
175
  def do_GET(self):
176
+ assert self.path[0] == "/"
177
  target = unquote(self.path[1:])
178
 
179
+ if target == "":
180
  self.send_response(302)
181
+ self.send_header("Location", "?" + args.initial_target)
182
  self.end_headers()
183
  return
184
 
185
+ if not target.startswith("?"):
186
  self.send_response(404)
187
  self.end_headers()
188
  return
 
193
  page_body = generate_html(parse(ninja_output.strip()))
194
  else:
195
  # Relay ninja's error message.
196
+ page_body = "<h1><tt>%s</tt></h1>" % html_escape(ninja_error)
197
 
198
  self.send_response(200)
199
  self.end_headers()
200
+ self.wfile.write(create_page(page_body).encode("utf-8"))
201
 
202
  def log_message(self, format, *args):
203
  pass # Swallow console spam.
204
 
205
+
206
+ parser = argparse.ArgumentParser(prog="ninja -t browse")
207
+ parser.add_argument("--port", "-p", default=8000, type=int, help="Port number to use (default %(default)d)")
208
+ parser.add_argument(
209
+ "--hostname", "-a", default="localhost", type=str, help="Hostname to bind to (default %(default)s)"
210
+ )
211
+ parser.add_argument("--no-browser", action="store_true", help="Do not open a webbrowser on startup.")
212
+
213
+ parser.add_argument("--ninja-command", default="ninja", help="Path to ninja binary (default %(default)s)")
214
+ parser.add_argument("-f", default="build.ninja", help="Path to build.ninja file (default %(default)s)")
215
+ parser.add_argument("initial_target", default="all", nargs="?", help="Initial target to show (default %(default)s)")
216
+
 
 
217
 
218
  class HTTPServer(socketserver.ThreadingMixIn, httpserver.HTTPServer):
219
  # terminate server immediately when Python exits.
220
  daemon_threads = True
221
 
222
+
223
  args = parser.parse_args()
224
  port = args.port
225
  hostname = args.hostname
226
+ httpd = HTTPServer((hostname, port), RequestHandler)
227
  try:
228
  if hostname == "":
229
  hostname = socket.gethostname()
230
+ print("Web server running on %s:%d, ctl-C to abort..." % (hostname, port))
231
+ print("Web server pid %d" % os.getpid(), file=sys.stderr)
232
  if not args.no_browser:
233
+ webbrowser.open_new("http://%s:%s" % (hostname, port))
234
  httpd.serve_forever()
235
  except KeyboardInterrupt:
236
  print()
237
  pass # Swallow console spam.
 
 
src/display/utils.py CHANGED
@@ -61,6 +61,7 @@ class ColumnContent:
61
  never_hidden: bool = False
62
  dummy: bool = False
63
 
 
64
  auto_eval_column_dict = []
65
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)])
66
  auto_eval_column_dict.append(["hardware", ColumnContent, ColumnContent("Hardware", "str", True, never_hidden=True)])
 
61
  never_hidden: bool = False
62
  dummy: bool = False
63
 
64
+
65
  auto_eval_column_dict = []
66
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)])
67
  auto_eval_column_dict.append(["hardware", ColumnContent, ColumnContent("Hardware", "str", True, never_hidden=True)])
src/leaderboard/filter_models.py CHANGED
@@ -29,9 +29,9 @@ def flag_models(leaderboard_data: list[dict]):
29
  FLAGGED_MODELS[model_data["model_name_for_query"]],
30
  f"See discussion #{issue_num}",
31
  )
32
- model_data[
33
- AutoEvalColumn.model.name
34
- ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
35
 
36
 
37
  def remove_forbidden_models(leaderboard_data: list[dict]):
 
29
  FLAGGED_MODELS[model_data["model_name_for_query"]],
30
  f"See discussion #{issue_num}",
31
  )
32
+ model_data[AutoEvalColumn.model.name] = (
33
+ f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
34
+ )
35
 
36
 
37
  def remove_forbidden_models(leaderboard_data: list[dict]):
src/leaderboard/read_evals.py CHANGED
@@ -5,6 +5,7 @@ from tqdm import tqdm
5
  from dataclasses import dataclass
6
 
7
  import dateutil
 
8
  # import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
@@ -32,13 +33,13 @@ class EvalResult:
32
  revision: str # commit hash, "" if main
33
  results: dict
34
  precision: Precision = Precision.Unknown
35
- model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
36
- weight_type: WeightType = WeightType.Original # Original or Adapter
37
- architecture: str = "Unknown" # From config file
38
  license: str = "?"
39
  likes: int = 0
40
  num_params: int = 0
41
- date: str = "" # submission date of request file
42
  still_on_hub: bool = False
43
 
44
  @staticmethod
@@ -67,7 +68,9 @@ class EvalResult:
67
  result_key = f"{org}_{model}_{precision.value.name}"
68
  full_model = "/".join(org_and_model)
69
 
70
- still_on_hub, error, model_config = is_model_on_hub(full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False)
 
 
71
  architecture = "?"
72
  if model_config is not None:
73
  architectures = getattr(model_config, "architectures", None)
@@ -79,35 +82,43 @@ class EvalResult:
79
  # data['results'] is {'nq_open': {'em': 0.24293628808864265, 'em_stderr': 0.007138697341112125}}
80
 
81
  results = {}
82
- for benchmark, benchmark_results in data['results'].items():
83
  if benchmark not in results:
84
  results[benchmark] = {}
85
 
86
  for metric, value in benchmark_results.items():
87
  to_add = True
88
- if '_stderr' in metric:
89
  to_add = False
90
- if 'alias' in metric:
91
  to_add = False
92
 
93
- if ',' in metric:
94
- metric = metric.split(',')[0]
95
  metric = metric.replace("exact_match", "em")
96
 
97
  if to_add is True:
98
  multiplier = 100.0
99
- if 'rouge' in metric and 'truthful' not in benchmark:
100
  multiplier = 1.0
101
- if 'squad' in benchmark:
102
  multiplier = 1.0
103
 
104
  # print('RESULTS', data['results'])
105
  # print('XXX', benchmark, metric, value, multiplier)
106
  results[benchmark][metric] = value * multiplier
107
 
108
- res = EvalResult(eval_name=result_key, full_model=full_model, org=org, model=model, results=results,
109
- precision=precision, revision=config.get("model_sha", ""), still_on_hub=still_on_hub,
110
- architecture=architecture)
 
 
 
 
 
 
 
 
111
 
112
  return res
113
 
@@ -183,6 +194,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
183
  request_file = tmp_request_file
184
  return request_file
185
 
 
186
  def get_request_file_for_model_open_llm(requests_path, model_name, precision):
187
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
188
  request_files = os.path.join(
@@ -197,16 +209,16 @@ def get_request_file_for_model_open_llm(requests_path, model_name, precision):
197
  for tmp_request_file in request_files:
198
  with open(tmp_request_file, "r") as f:
199
  req_content = json.load(f)
200
- if (
201
- req_content["status"] in ["FINISHED"]
202
- and req_content["precision"] == precision.split(".")[-1]
203
- ):
204
  request_file = tmp_request_file
205
  return request_file
206
 
 
207
  def update_model_type_with_open_llm_request_file(result, open_llm_requests_path):
208
  """Finds the relevant request file for the current model and updates info with it"""
209
- request_file = get_request_file_for_model_open_llm(open_llm_requests_path, result.full_model, result.precision.value.name)
 
 
210
 
211
  if request_file:
212
  try:
@@ -219,9 +231,8 @@ def update_model_type_with_open_llm_request_file(result, open_llm_requests_path)
219
  pass
220
  return result
221
 
222
- def get_raw_eval_results(results_path: str,
223
- requests_path: str,
224
- is_backend: bool = False) -> list[EvalResult]:
225
  """From the path of the results folder root, extract all needed info for results"""
226
  model_result_filepaths = []
227
 
 
5
  from dataclasses import dataclass
6
 
7
  import dateutil
8
+
9
  # import numpy as np
10
 
11
  from src.display.formatting import make_clickable_model
 
33
  revision: str # commit hash, "" if main
34
  results: dict
35
  precision: Precision = Precision.Unknown
36
+ model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
37
+ weight_type: WeightType = WeightType.Original # Original or Adapter
38
+ architecture: str = "Unknown" # From config file
39
  license: str = "?"
40
  likes: int = 0
41
  num_params: int = 0
42
+ date: str = "" # submission date of request file
43
  still_on_hub: bool = False
44
 
45
  @staticmethod
 
68
  result_key = f"{org}_{model}_{precision.value.name}"
69
  full_model = "/".join(org_and_model)
70
 
71
+ still_on_hub, error, model_config = is_model_on_hub(
72
+ full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
73
+ )
74
  architecture = "?"
75
  if model_config is not None:
76
  architectures = getattr(model_config, "architectures", None)
 
82
  # data['results'] is {'nq_open': {'em': 0.24293628808864265, 'em_stderr': 0.007138697341112125}}
83
 
84
  results = {}
85
+ for benchmark, benchmark_results in data["results"].items():
86
  if benchmark not in results:
87
  results[benchmark] = {}
88
 
89
  for metric, value in benchmark_results.items():
90
  to_add = True
91
+ if "_stderr" in metric:
92
  to_add = False
93
+ if "alias" in metric:
94
  to_add = False
95
 
96
+ if "," in metric:
97
+ metric = metric.split(",")[0]
98
  metric = metric.replace("exact_match", "em")
99
 
100
  if to_add is True:
101
  multiplier = 100.0
102
+ if "rouge" in metric and "truthful" not in benchmark:
103
  multiplier = 1.0
104
+ if "squad" in benchmark:
105
  multiplier = 1.0
106
 
107
  # print('RESULTS', data['results'])
108
  # print('XXX', benchmark, metric, value, multiplier)
109
  results[benchmark][metric] = value * multiplier
110
 
111
+ res = EvalResult(
112
+ eval_name=result_key,
113
+ full_model=full_model,
114
+ org=org,
115
+ model=model,
116
+ results=results,
117
+ precision=precision,
118
+ revision=config.get("model_sha", ""),
119
+ still_on_hub=still_on_hub,
120
+ architecture=architecture,
121
+ )
122
 
123
  return res
124
 
 
194
  request_file = tmp_request_file
195
  return request_file
196
 
197
+
198
  def get_request_file_for_model_open_llm(requests_path, model_name, precision):
199
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
200
  request_files = os.path.join(
 
209
  for tmp_request_file in request_files:
210
  with open(tmp_request_file, "r") as f:
211
  req_content = json.load(f)
212
+ if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
 
 
 
213
  request_file = tmp_request_file
214
  return request_file
215
 
216
+
217
  def update_model_type_with_open_llm_request_file(result, open_llm_requests_path):
218
  """Finds the relevant request file for the current model and updates info with it"""
219
+ request_file = get_request_file_for_model_open_llm(
220
+ open_llm_requests_path, result.full_model, result.precision.value.name
221
+ )
222
 
223
  if request_file:
224
  try:
 
231
  pass
232
  return result
233
 
234
+
235
+ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool = False) -> list[EvalResult]:
 
236
  """From the path of the results folder root, extract all needed info for results"""
237
  model_result_filepaths = []
238
 
src/populate.py CHANGED
@@ -13,17 +13,21 @@ from src.backend.envs import Tasks as BackendTasks
13
  from src.display.utils import Tasks
14
 
15
 
16
- def get_leaderboard_df(results_path: str,
17
- requests_path: str,
18
- requests_path_open_llm: str,
19
- cols: list,
20
- benchmark_cols: list,
21
- is_backend: bool = False) -> tuple[list[EvalResult], pd.DataFrame]:
 
 
22
  # Returns a list of EvalResult
23
  raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path, requests_path_open_llm)
24
  if requests_path_open_llm != "":
25
  for result_idx in tqdm(range(len(raw_data)), desc="updating model type with open llm leaderboard"):
26
- raw_data[result_idx] = update_model_type_with_open_llm_request_file(raw_data[result_idx], requests_path_open_llm)
 
 
27
 
28
  all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
29
 
 
13
  from src.display.utils import Tasks
14
 
15
 
16
+ def get_leaderboard_df(
17
+ results_path: str,
18
+ requests_path: str,
19
+ requests_path_open_llm: str,
20
+ cols: list,
21
+ benchmark_cols: list,
22
+ is_backend: bool = False,
23
+ ) -> tuple[list[EvalResult], pd.DataFrame]:
24
  # Returns a list of EvalResult
25
  raw_data: list[EvalResult] = get_raw_eval_results(results_path, requests_path, requests_path_open_llm)
26
  if requests_path_open_llm != "":
27
  for result_idx in tqdm(range(len(raw_data)), desc="updating model type with open llm leaderboard"):
28
+ raw_data[result_idx] = update_model_type_with_open_llm_request_file(
29
+ raw_data[result_idx], requests_path_open_llm
30
+ )
31
 
32
  all_data_json_ = [v.to_dict() for v in raw_data if v.is_complete()]
33
 
src/submission/check_validity.py CHANGED
@@ -40,20 +40,34 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
40
  return True, ""
41
 
42
 
43
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, Optional[str], Optional[AutoConfig]]:
 
 
44
  try:
45
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
 
 
46
  if test_tokenizer:
47
  try:
48
- AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
 
 
49
  except ValueError as e:
50
  return False, f"uses a tokenizer which is not in a transformers release: {e}", None
51
  except Exception as e:
52
- return False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None
 
 
 
 
53
  return True, None, config
54
 
55
  except ValueError as e:
56
- return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.", None
 
 
 
 
57
 
58
  except Exception as e:
59
  return False, f"was not found on hub -- {str(e)}", None
@@ -63,7 +77,7 @@ def get_model_size(model_info: ModelInfo, precision: str):
63
  size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
64
  try:
65
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
- except (AttributeError, TypeError ):
67
  try:
68
  size_match = re.search(size_pattern, model_info.modelId.lower())
69
  model_size = size_match.group(0)
@@ -75,9 +89,11 @@ def get_model_size(model_info: ModelInfo, precision: str):
75
  model_size = size_factor * model_size
76
  return model_size
77
 
 
78
  def get_model_arch(model_info: ModelInfo):
79
  return model_info.config.get("architectures", "Unknown")
80
 
 
81
  def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
82
  if org_or_user not in users_to_submission_dates:
83
  return True, ""
 
40
  return True, ""
41
 
42
 
43
+ def is_model_on_hub(
44
+ model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
45
+ ) -> tuple[bool, Optional[str], Optional[AutoConfig]]:
46
  try:
47
+ config = AutoConfig.from_pretrained(
48
+ model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
49
+ )
50
  if test_tokenizer:
51
  try:
52
+ AutoTokenizer.from_pretrained(
53
+ model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
54
+ )
55
  except ValueError as e:
56
  return False, f"uses a tokenizer which is not in a transformers release: {e}", None
57
  except Exception as e:
58
+ return (
59
+ False,
60
+ "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
61
+ None,
62
+ )
63
  return True, None, config
64
 
65
  except ValueError as e:
66
+ return (
67
+ False,
68
+ "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
69
+ None,
70
+ )
71
 
72
  except Exception as e:
73
  return False, f"was not found on hub -- {str(e)}", None
 
77
  size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
78
  try:
79
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
80
+ except (AttributeError, TypeError):
81
  try:
82
  size_match = re.search(size_pattern, model_info.modelId.lower())
83
  model_size = size_match.group(0)
 
89
  model_size = size_factor * model_size
90
  return model_size
91
 
92
+
93
  def get_model_arch(model_info: ModelInfo):
94
  return model_info.config.get("architectures", "Unknown")
95
 
96
+
97
  def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
98
  if org_or_user not in users_to_submission_dates:
99
  return True, ""
src/submission/submit.py CHANGED
@@ -61,7 +61,9 @@ def add_new_eval(
61
 
62
  # Is the model on the hub?
63
  if weight_type in ["Delta", "Adapter"]:
64
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=False)
 
 
65
  if not base_model_on_hub:
66
  return styled_error(f'Base model "{base_model}" {error}')
67
 
 
61
 
62
  # Is the model on the hub?
63
  if weight_type in ["Delta", "Adapter"]:
64
+ base_model_on_hub, error, _ = is_model_on_hub(
65
+ model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=False
66
+ )
67
  if not base_model_on_hub:
68
  return styled_error(f'Base model "{base_model}" {error}')
69
 
src/utils.py CHANGED
@@ -5,18 +5,21 @@ from huggingface_hub import snapshot_download
5
  def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
6
  for i in range(10):
7
  try:
8
- snapshot_download(repo_id=repo_id, revision=revision, local_dir=local_dir, repo_type=repo_type, max_workers=max_workers)
 
 
9
  return
10
  except Exception as e:
11
  print(f"Failed to download {repo_id} at {revision} with error: {e}. Retrying...")
12
  import time
 
13
  time.sleep(60)
14
  return
15
 
16
 
17
  def get_dataset_url(row):
18
- dataset_name = row['Benchmark']
19
- dataset_url = row['Dataset Link']
20
  benchmark = f'<a target="_blank" href="{dataset_url}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{dataset_name}</a>'
21
  return benchmark
22
 
@@ -24,8 +27,8 @@ def get_dataset_url(row):
24
  def get_dataset_summary_table(file_path):
25
  df = pd.read_csv(file_path)
26
 
27
- df['Benchmark'] = df.apply(lambda x: get_dataset_url(x), axis=1)
28
 
29
- df = df[['Category', 'Benchmark', 'Data Split', 'Data Size', 'Language']]
30
 
31
  return df
 
5
  def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
6
  for i in range(10):
7
  try:
8
+ snapshot_download(
9
+ repo_id=repo_id, revision=revision, local_dir=local_dir, repo_type=repo_type, max_workers=max_workers
10
+ )
11
  return
12
  except Exception as e:
13
  print(f"Failed to download {repo_id} at {revision} with error: {e}. Retrying...")
14
  import time
15
+
16
  time.sleep(60)
17
  return
18
 
19
 
20
  def get_dataset_url(row):
21
+ dataset_name = row["Benchmark"]
22
+ dataset_url = row["Dataset Link"]
23
  benchmark = f'<a target="_blank" href="{dataset_url}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{dataset_name}</a>'
24
  return benchmark
25
 
 
27
  def get_dataset_summary_table(file_path):
28
  df = pd.read_csv(file_path)
29
 
30
+ df["Benchmark"] = df.apply(lambda x: get_dataset_url(x), axis=1)
31
 
32
+ df = df[["Category", "Benchmark", "Data Split", "Data Size", "Language"]]
33
 
34
  return df