Support dynamic refresh of the leaderboard page

#15
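
In short: app.py gains a visible "Refresh" button wired to a `refresh_leaderboard()` callback that pushes `gr.update(...)` into every table and counter, plus a `periodic_init()` daemon thread that re-runs `init_space()` every 60 seconds, so the page picks up new results without restarting the Space. Below is a minimal sketch of that wiring; component and function names follow the app.py diff, while `load_data()` is a stub standing in for `init_space()` (the real function returns the leaderboard plus the finished/running/pending queue dataframes):

```python
import threading
import time

import gradio as gr
import pandas as pd


def load_data() -> pd.DataFrame:
    # Stand-in for init_space() in app.py, which re-reads eval results and queues.
    return pd.DataFrame({"model": ["m1", "m2"], "score": [0.1, 0.2]})


leaderboard_df = load_data()


def periodic_init():
    # Background loop from app.py: refresh the cached dataframe once a minute.
    global leaderboard_df
    while True:
        time.sleep(60)
        leaderboard_df = load_data()


def refresh_leaderboard():
    # Push the freshly cached data back into the UI component(s).
    return gr.update(value=leaderboard_df)


with gr.Blocks() as demo:
    refresh_button = gr.Button("Refresh", visible=True)
    leaderboard_table = gr.Dataframe(value=leaderboard_df, interactive=False)
    refresh_button.click(refresh_leaderboard, outputs=[leaderboard_table])

threading.Thread(target=periodic_init, daemon=True).start()
demo.queue(default_concurrency_limit=40).launch()
```

In the actual app, `refresh_leaderboard()` returns nine `gr.update(...)` values (leaderboard, hidden search table, dataset table, three queue tables, three counters), and the `periodic_init()` daemon thread is started before `block_launch()` brings up the UI.
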
LICENSE DELETED
@@ -1,201 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
- [... remainder of the standard Apache License, Version 2.0 text ...]
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🔥
  colorFrom: green
  colorTo: indigo
  sdk: gradio
- sdk_version: 4.26.0
+ sdk_version: 4.9.0
  app_file: app.py
  pinned: true
  license: apache-2.0
app.py CHANGED
@@ -1,8 +1,8 @@
1
  #!/usr/bin/env python
 
2
  import os
3
  import datetime
4
  import socket
5
- import base64
6
  from threading import Thread
7
 
8
  import gradio as gr
@@ -11,7 +11,6 @@ import time
11
  from apscheduler.schedulers.background import BackgroundScheduler
12
 
13
  from huggingface_hub import snapshot_download
14
- from pytz import utc
15
 
16
  from src.display.about import (
17
  CITATION_BUTTON_LABEL,
@@ -22,7 +21,6 @@ from src.display.about import (
22
  LLM_BENCHMARKS_DETAILS,
23
  FAQ_TEXT,
24
  TITLE,
25
- ACKNOWLEDGEMENT_TEXT,
26
  )
27
 
28
  from src.display.css_html_js import custom_css
@@ -39,7 +37,6 @@ from src.display.utils import (
39
  fields,
40
  WeightType,
41
  Precision,
42
- GPUType
43
  )
44
 
45
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, \
@@ -76,7 +73,7 @@ def restart_space():
76
 
77
 
78
  def init_space():
79
- # dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
80
 
81
  if socket.gethostname() not in {"neuromancer"}:
82
  # sync model_type with open-llm-leaderboard
@@ -91,19 +88,7 @@ def init_space():
91
  finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
92
  EVAL_REQUESTS_PATH, EVAL_COLS
93
  )
94
- # return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
95
- return None, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
96
-
97
-
98
- def add_benchmark_columns(shown_columns):
99
- benchmark_columns = []
100
- for benchmark in BENCHMARK_COLS:
101
- if benchmark in shown_columns:
102
- for c in COLS:
103
- if benchmark in c and benchmark != c:
104
- benchmark_columns.append(c)
105
- return benchmark_columns
106
-
107
 
108
  # Searching and filtering
109
  def update_table(
@@ -111,8 +96,7 @@ def update_table(
111
  ):
112
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
113
  filtered_df = filter_queries(query, filtered_df)
114
- benchmark_columns = add_benchmark_columns(columns)
115
- df = select_columns(filtered_df, columns + benchmark_columns)
116
  return df
117
 
118
 
@@ -160,7 +144,6 @@ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precisio
160
  type_emoji = [t[0] for t in type_query]
161
  filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
162
  filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
163
- filtered_df = filtered_df.loc[df[AutoEvalColumn.inference_framework.name].isin(size_query)]
164
 
165
  # numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
166
  # params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
@@ -173,176 +156,154 @@ shown_columns = None
173
  dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
174
  leaderboard_df = original_df.copy()
175
 
176
- # def update_leaderboard_table():
177
- # global leaderboard_df, shown_columns
178
- # print("Updating leaderboard table")
179
- # return leaderboard_df[
180
- # [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
181
- # + shown_columns.value
182
- # + [AutoEvalColumn.dummy.name]
183
- # ] if not leaderboard_df.empty else leaderboard_df
184
 
185
 
186
- # def update_hidden_leaderboard_table():
187
- # global original_df
188
- # return original_df[COLS] if original_df.empty is False else original_df
189
 
190
- # def update_dataset_table():
191
- # global dataset_df
192
- # return dataset_df
193
 
194
- # def update_finish_table():
195
- # global finished_eval_queue_df
196
- # return finished_eval_queue_df
197
 
198
- # def update_running_table():
199
- # global running_eval_queue_df
200
- # return running_eval_queue_df
201
 
202
- # def update_pending_table():
203
- # global pending_eval_queue_df
204
- # return pending_eval_queue_df
205
 
206
- # def update_finish_num():
207
- # global finished_eval_queue_df
208
- # return len(finished_eval_queue_df)
209
 
210
- # def update_running_num():
211
- # global running_eval_queue_df
212
- # return len(running_eval_queue_df)
213
 
214
- # def update_pending_num():
215
- # global pending_eval_queue_df
216
- # return len(pending_eval_queue_df)
217
 
218
  # triggered only once at startup => read query parameter if it exists
219
  def load_query(request: gr.Request):
220
  query = request.query_params.get("query") or ""
221
  return query
222
-
223
-
224
- def get_image_html(url, image_path):
225
- with open(image_path, "rb") as image_file:
226
- encoded_string = base64.b64encode(image_file.read()).decode()
227
- return f'<a href="{url}" target="_blank"><img src="data:image/jpg;base64,{encoded_string}" alt="NetMind.AI Logo" style="width:100pt;"></a>'
228
-
229
-
230
- # Prepare the HTML content with the image
231
- image_html = get_image_html("https://netmind.ai/home", "./src/display/imgs/Netmind.AI_LOGO.jpg")
232
-
233
-
234
- demo = gr.Blocks(css=custom_css)
235
- with demo:
236
- gr.HTML(TITLE)
237
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
238
- gr.HTML(ACKNOWLEDGEMENT_TEXT.format(image_html=image_html))
239
-
240
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
241
- with gr.TabItem("open-moe-llm-leaderboard", elem_id="llm-benchmark-tab-table", id=0):
242
- with gr.Row():
243
- with gr.Column():
244
- with gr.Row():
245
- search_bar = gr.Textbox(
246
- placeholder=" 🔍 Model search (separate multiple queries with `;`)",
247
- show_label=False,
248
- elem_id="search-bar"
249
  )
250
- with gr.Row():
251
- shown_columns = gr.CheckboxGroup(
252
- choices=[
253
- c.name
254
- for c in fields(AutoEvalColumn)
255
- if not c.hidden and not c.never_hidden and not c.dummy
256
- ],
257
- value=[
258
- c.name
259
- for c in fields(AutoEvalColumn)
260
- if c.displayed_by_default and not c.hidden and not c.never_hidden
261
- ],
262
- label="Tasks",
263
- elem_id="column-select",
264
  interactive=True,
 
265
  )
266
-
267
- with gr.Column(min_width=320):
268
- filter_columns_size = gr.CheckboxGroup(
269
- label="Inference frameworks",
270
- choices=[t.to_str() for t in InferenceFramework],
271
- value=[t.to_str() for t in InferenceFramework],
272
- interactive=True,
273
- elem_id="filter-columns-size",
274
- )
275
-
276
- filter_columns_type = gr.CheckboxGroup(
277
- label="Model types",
278
- choices=[t.to_str() for t in ModelType],
279
- value=[t.to_str() for t in ModelType],
280
- interactive=True,
281
- elem_id="filter-columns-type",
282
- )
283
-
284
- filter_columns_precision = gr.CheckboxGroup(
285
- label="Precision",
286
- choices=[i.value.name for i in Precision],
287
- value=[i.value.name for i in Precision],
288
- interactive=True,
289
- elem_id="filter-columns-precision",
290
- )
291
-
292
- # filter_columns_size = gr.CheckboxGroup(
293
- # label="Model sizes (in billions of parameters)",
294
- # choices=list(NUMERIC_INTERVALS.keys()),
295
- # value=list(NUMERIC_INTERVALS.keys()),
296
- # interactive=True,
297
- # elem_id="filter-columns-size",
298
- # )
299
-
300
- # breakpoint()
301
- benchmark_columns = add_benchmark_columns(shown_columns.value)
302
- leaderboard_table = gr.components.Dataframe(
303
- value=(
304
- leaderboard_df[
305
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
306
- + shown_columns.value
307
- + benchmark_columns
308
- + [AutoEvalColumn.dummy.name]
309
- ]
310
- if leaderboard_df.empty is False
311
- else leaderboard_df
312
- ),
313
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value + benchmark_columns,
314
- datatype=TYPES,
315
- elem_id="leaderboard-table",
316
- interactive=False,
317
- visible=True,
318
- ) # column_widths=["2%", "20%"]
319
-
320
- # Dummy leaderboard for handling the case when the user uses backspace key
321
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
322
- value=original_df[COLS] if original_df.empty is False else original_df,
323
- headers=COLS,
324
- datatype=TYPES,
325
- visible=False,
326
- )
327
-
328
- search_bar.submit(
329
- update_table,
330
- [
331
- hidden_leaderboard_table_for_search,
332
- shown_columns,
333
- filter_columns_type,
334
- filter_columns_precision,
335
- filter_columns_size,
336
- search_bar,
337
- ],
338
- leaderboard_table
339
- )
340
-
341
- # Check query parameter once at startup and update search bar
342
- demo.load(load_query, inputs=[], outputs=[search_bar])
343
-
344
- for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
345
- selector.change(
346
  update_table,
347
  [
348
  hidden_leaderboard_table_for_search,
@@ -353,133 +314,137 @@ with demo:
353
  search_bar,
354
  ],
355
  leaderboard_table,
356
- queue=True,
357
  )
358
-
359
- # with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
360
- # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
361
-
362
- # dataset_table = gr.components.Dataframe(
363
- # value=dataset_df,
364
- # headers=list(dataset_df.columns),
365
- # datatype=["str", "markdown", "str", "str", "str"],
366
- # elem_id="dataset-table",
367
- # interactive=False,
368
- # visible=True,
369
- # column_widths=["15%", "20%"],
370
- # )
371
-
372
- # gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
373
- # gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
374
-
375
- with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
376
- with gr.Column():
377
- with gr.Row():
378
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
379
-
380
- with gr.Column():
381
- with gr.Accordion(f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", open=False):
382
- with gr.Row():
383
- finished_eval_table = gr.components.Dataframe(
384
- value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
385
- )
386
-
387
- with gr.Accordion(f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", open=False):
388
- with gr.Row():
389
- running_eval_table = gr.components.Dataframe(
390
- value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
391
- )
392
-
393
- with gr.Accordion(f"⏳ Scheduled Evaluation Queue ({len(pending_eval_queue_df)})", open=False):
394
- with gr.Row():
395
- pending_eval_table = gr.components.Dataframe(
396
- value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5
397
- )
398
-
399
- with gr.Row():
400
- gr.Markdown("# Submit your model here", elem_classes="markdown-text")
401
-
402
- with gr.Row():
403
- inference_framework = gr.Dropdown(
404
- choices=[t.to_str() for t in InferenceFramework],
405
- label="Inference framework",
406
- multiselect=False,
407
- value=None,
408
- interactive=True,
409
- )
410
-
411
- gpu_type = gr.Dropdown(
412
- choices=[t.to_str() for t in GPUType],
413
- label="GPU type",
414
- multiselect=False,
415
- value="NVIDIA-A100-PCIe-80GB",
416
- interactive=True,
417
  )
418
-
419
-
420
- with gr.Row():
 
421
  with gr.Column():
422
- model_name_textbox = gr.Textbox(label="Model name")
423
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
424
- private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
425
- model_type = gr.Dropdown(
426
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
427
- label="Model type",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
  multiselect=False,
429
  value=None,
430
  interactive=True,
431
  )
432
-
433
- with gr.Column():
434
- precision = gr.Dropdown(
435
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
436
- label="Precision",
437
- multiselect=False,
438
- value="float32",
439
- interactive=True,
440
- )
441
-
442
- weight_type = gr.Dropdown(
443
- choices=[i.value.name for i in WeightType],
444
- label="Weights type",
445
- multiselect=False,
446
- value="Original",
447
- interactive=True,
448
- )
449
-
450
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
451
-
452
- submit_button = gr.Button("Submit Eval")
453
- submission_result = gr.Markdown()
454
- debug = gr.Checkbox(value=args.debug, label="Debug", visible=False)
455
- submit_button.click(
456
- add_new_eval,
457
- [
458
- model_name_textbox,
459
- base_model_name_textbox,
460
- revision_name_textbox,
461
- precision,
462
- private,
463
- weight_type,
464
- model_type,
465
- inference_framework,
466
- debug,
467
- gpu_type
468
- ],
469
- submission_result,
470
- )
471
-
472
- with gr.Row():
473
- with gr.Accordion("Citing this leaderboard", open=False):
474
- citation_button = gr.Textbox(
475
- value=CITATION_BUTTON_TEXT,
476
- label=CITATION_BUTTON_LABEL,
477
- lines=20,
478
- elem_id="citation-button",
479
- show_copy_button=True,
480
- )
481
-
482
- scheduler = BackgroundScheduler(timezone=utc)
483
 
484
  scheduler.add_job(restart_space, "interval", hours=6)
485
 
@@ -490,9 +455,9 @@ def launch_backend():
490
  if DEVICE not in {"cpu"}:
491
  _ = subprocess.run(["python", "backend-cli.py"])
492
 
493
- # Thread(target=periodic_init, daemon=True).start()
494
  # scheduler.add_job(launch_backend, "interval", seconds=120)
495
  if __name__ == "__main__":
496
  scheduler.start()
497
- demo.queue(default_concurrency_limit=40).launch()
498
 
 
1
  #!/usr/bin/env python
2
+
3
  import os
4
  import datetime
5
  import socket
 
6
  from threading import Thread
7
 
8
  import gradio as gr
 
11
  from apscheduler.schedulers.background import BackgroundScheduler
12
 
13
  from huggingface_hub import snapshot_download
 
14
 
15
  from src.display.about import (
16
  CITATION_BUTTON_LABEL,
 
21
  LLM_BENCHMARKS_DETAILS,
22
  FAQ_TEXT,
23
  TITLE,
 
24
  )
25
 
26
  from src.display.css_html_js import custom_css
 
37
  fields,
38
  WeightType,
39
  Precision,
 
40
  )
41
 
42
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, \
 
73
 
74
 
75
  def init_space():
76
+ dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")
77
 
78
  if socket.gethostname() not in {"neuromancer"}:
79
  # sync model_type with open-llm-leaderboard
 
88
  finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
89
  EVAL_REQUESTS_PATH, EVAL_COLS
90
  )
91
+ return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  # Searching and filtering
94
  def update_table(
 
96
  ):
97
  filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
98
  filtered_df = filter_queries(query, filtered_df)
99
+ df = select_columns(filtered_df, columns)
 
100
  return df
101
 
102
 
 
144
  type_emoji = [t[0] for t in type_query]
145
  filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
146
  filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
 
147
 
148
  # numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
149
  # params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
 
156
  dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
157
  leaderboard_df = original_df.copy()
158
 
159
+ def update_leaderboard_table():
160
+ global leaderboard_df, shown_columns
161
+ print("Updating leaderboard table")
162
+ return leaderboard_df[
163
+ [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
164
+ + shown_columns.value
165
+ + [AutoEvalColumn.dummy.name]
166
+ ] if not leaderboard_df.empty else leaderboard_df
167
 
168
 
169
+ def update_hidden_leaderboard_table():
170
+ global original_df
171
+ return original_df[COLS] if original_df.empty is False else original_df
172
 
173
+ def update_dataset_table():
174
+ global dataset_df
175
+ return dataset_df
176
 
177
+ def update_finish_table():
178
+ global finished_eval_queue_df
179
+ return finished_eval_queue_df
180
 
181
+ def update_running_table():
182
+ global running_eval_queue_df
183
+ return running_eval_queue_df
184
 
185
+ def update_pending_table():
186
+ global pending_eval_queue_df
187
+ return pending_eval_queue_df
188
 
189
+ def update_finish_num():
190
+ global finished_eval_queue_df
191
+ return len(finished_eval_queue_df)
192
 
193
+ def update_running_num():
194
+ global running_eval_queue_df
195
+ return len(running_eval_queue_df)
196
 
197
+ def update_pending_num():
198
+ global pending_eval_queue_df
199
+ return len(pending_eval_queue_df)
200
 
201
  # triggered only once at startup => read query parameter if it exists
202
  def load_query(request: gr.Request):
203
  query = request.query_params.get("query") or ""
204
  return query
205
+
206
+ def refresh_leaderboard():
207
+ return gr.update(value=update_leaderboard_table()), gr.update(value=update_hidden_leaderboard_table()), \
208
+ gr.update(value=update_dataset_table()), gr.update(value=update_finish_table()), \
209
+ gr.update(value=update_running_table()), gr.update(value=update_pending_table()), \
210
+ gr.update(value=update_finish_num()), gr.update(value=update_running_num()), gr.update(value=update_pending_num())
211
+
212
+ def periodic_init():
213
+ global dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, leaderboard_df
214
+ while True:
215
+ time.sleep(60)
216
+ dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
217
+ leaderboard_df = original_df.copy()
218
+
219
+ def block_launch():
220
+ global dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, leaderboard_df, shown_columns
221
+ demo = gr.Blocks(css=custom_css)
222
+ with demo:
223
+ gr.HTML(TITLE)
224
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
225
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
226
+ with gr.TabItem("MOE-LLM-GPU-Poor-Leaderboard Benchmark", elem_id="llm-benchmark-tab-table", id=0):
227
+ with gr.Row():
228
+ with gr.Column():
229
+ with gr.Row():
230
+ search_bar = gr.Textbox(
231
+ placeholder=" 🔍 Model search (separate multiple queries with `;`)",
232
+ show_label=False,
233
+ elem_id="search-bar",
234
+ )
235
+ with gr.Row():
236
+ shown_columns = gr.CheckboxGroup(
237
+ choices=[
238
+ c.name
239
+ for c in fields(AutoEvalColumn)
240
+ if not c.hidden and not c.never_hidden and not c.dummy
241
+ ],
242
+ value=[
243
+ c.name
244
+ for c in fields(AutoEvalColumn)
245
+ if c.displayed_by_default and not c.hidden and not c.never_hidden
246
+ ],
247
+ label="Select columns to show",
248
+ elem_id="column-select",
249
+ interactive=True,
250
+ )
251
+ with gr.Column(min_width=320):
252
+ filter_columns_size = gr.CheckboxGroup(
253
+ label="Inference frameworks",
254
+ choices=[t.to_str() for t in InferenceFramework],
255
+ value=[t.to_str() for t in InferenceFramework],
256
+ interactive=True,
257
+ elem_id="filter-columns-size",
258
  )
259
+ filter_columns_type = gr.CheckboxGroup(
260
+ label="Model types",
261
+ choices=[t.to_str() for t in ModelType],
262
+ value=[t.to_str() for t in ModelType],
 
 
 
 
 
 
 
 
 
 
263
  interactive=True,
264
+ elem_id="filter-columns-type",
265
  )
266
+ filter_columns_precision = gr.CheckboxGroup(
267
+ label="Precision",
268
+ choices=[i.value.name for i in Precision],
269
+ value=[i.value.name for i in Precision],
270
+ interactive=True,
271
+ elem_id="filter-columns-precision",
272
+ )
273
+ # filter_columns_size = gr.CheckboxGroup(
274
+ # label="Model sizes (in billions of parameters)",
275
+ # choices=list(NUMERIC_INTERVALS.keys()),
276
+ # value=list(NUMERIC_INTERVALS.keys()),
277
+ # interactive=True,
278
+ # elem_id="filter-columns-size",
279
+ # )
280
+ # breakpoint()
281
+ refresh_button = gr.Button("Refresh", visible=True)
282
+ leaderboard_table = gr.components.Dataframe(
283
+ value=(
284
+ leaderboard_df[
285
+ [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
286
+ + shown_columns.value
287
+ + [AutoEvalColumn.dummy.name]
288
+ ]
289
+ if leaderboard_df.empty is False
290
+ else leaderboard_df
291
+ ),
292
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
293
+ datatype=TYPES,
294
+ elem_id="leaderboard-table",
295
+ interactive=False,
296
+ visible=True,
297
+ ) # column_widths=["2%", "20%"]
298
+ # Dummy leaderboard for handling the case when the user uses backspace key
299
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
300
+ value=original_df[COLS] if original_df.empty is False else original_df,
301
+ headers=COLS,
302
+ datatype=TYPES,
303
+ visible=False,
304
+ )
305
+ # refresh_button.click(fn=update_leaderboard_tables, outputs=[leaderboard_table, hidden_leaderboard_table_for_search])
306
+ search_bar.submit(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  update_table,
308
  [
309
  hidden_leaderboard_table_for_search,
 
314
  search_bar,
315
  ],
316
  leaderboard_table,
 
317
  )
318
+ # Check query parameter once at startup and update search bar
319
+ demo.load(load_query, inputs=[], outputs=[search_bar])
320
+ for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
321
+ selector.change(
322
+ update_table,
323
+ [
324
+ hidden_leaderboard_table_for_search,
325
+ shown_columns,
326
+ filter_columns_type,
327
+ filter_columns_precision,
328
+ filter_columns_size,
329
+ search_bar,
330
+ ],
331
+ leaderboard_table,
332
+ queue=True,
333
+ )
334
+ with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
335
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
336
+ dataset_table = gr.components.Dataframe(
337
+ value=dataset_df,
338
+ headers=list(dataset_df.columns),
339
+ datatype=["str", "markdown", "str", "str", "str"],
340
+ elem_id="dataset-table",
341
+ interactive=False,
342
+ visible=True,
343
+ column_widths=["15%", "20%"],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
  )
345
+ gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
346
+ gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
347
+ # refresh_button.click(fn=update_dataset_table, outputs=[dataset_table])
348
+ with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
349
  with gr.Column():
350
+ with gr.Row():
351
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
352
+ with gr.Column():
353
+ with gr.Accordion(f"✅ Finished Evaluations", open=False):
354
+ with gr.Column():
355
+ num_fin = gr.Number(len(finished_eval_queue_df), label="Number of finished evaluations", visible=True, interactive=False)
356
+ with gr.Row():
357
+ finished_eval_table = gr.components.Dataframe(
358
+ value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=6
359
+ )
360
+ with gr.Accordion(f"🔄 Running Evaluation Queue", open=False):
361
+ with gr.Column():
362
+ num_run = gr.Number(len(running_eval_queue_df), label="Number of running evaluations", visible=True, interactive=False)
363
+ with gr.Row():
364
+ running_eval_table = gr.components.Dataframe(
365
+ value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=6
366
+ )
367
+ with gr.Accordion(f"⏳ Scheduled Evaluation Queue", open=False):
368
+ with gr.Column():
369
+ num_sche = gr.Number(len(pending_eval_queue_df), label="Number of scheduled evaluations", visible=True, interactive=False)
370
+ with gr.Row():
371
+ pending_eval_table = gr.components.Dataframe(
372
+ value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=6
373
+ )
374
+ # refresh_button.click(fn=update_submit_tables,
375
+ # outputs=[finished_eval_table, running_eval_table, pending_eval_table])
376
+ with gr.Row():
377
+ gr.Markdown("# Submit your model here", elem_classes="markdown-text")
378
+ with gr.Row():
379
+ inference_framework = gr.Dropdown(
380
+ choices=[t.to_str() for t in InferenceFramework],
381
+ label="Inference framework",
382
  multiselect=False,
383
  value=None,
384
  interactive=True,
385
  )
386
+ with gr.Row():
387
+ with gr.Column():
388
+ model_name_textbox = gr.Textbox(label="Model name")
389
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
390
+ private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
391
+ model_type = gr.Dropdown(
392
+ choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
393
+ label="Model type",
394
+ multiselect=False,
395
+ value=None,
396
+ interactive=True,
397
+ )
398
+ with gr.Column():
399
+ precision = gr.Dropdown(
400
+ choices=[i.value.name for i in Precision if i != Precision.Unknown],
401
+ label="Precision",
402
+ multiselect=False,
403
+ value="float32",
404
+ interactive=True,
405
+ )
406
+ weight_type = gr.Dropdown(
407
+ choices=[i.value.name for i in WeightType],
408
+ label="Weights type",
409
+ multiselect=False,
410
+ value="Original",
411
+ interactive=True,
412
+ )
413
+ base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
414
+ submit_button = gr.Button("Submit Eval")
415
+ submission_result = gr.Markdown()
416
+ debug = gr.Checkbox(args.debug, label="Debug", visible=False)
417
+ submit_button.click(
418
+ add_new_eval,
419
+ [
420
+ model_name_textbox,
421
+ base_model_name_textbox,
422
+ revision_name_textbox,
423
+ precision,
424
+ private,
425
+ weight_type,
426
+ model_type,
427
+ inference_framework,
428
+ debug
429
+ ],
430
+ submission_result,
431
+ )
432
+ refresh_button.click(refresh_leaderboard,
433
+ outputs=[leaderboard_table, hidden_leaderboard_table_for_search, dataset_table,
434
+ finished_eval_table, running_eval_table, pending_eval_table, num_fin, num_run, num_sche])
435
+
436
+ with gr.Row():
437
+ with gr.Accordion("Citing this leaderboard", open=False):
438
+ citation_button = gr.Textbox(
439
+ value=CITATION_BUTTON_TEXT,
440
+ label=CITATION_BUTTON_LABEL,
441
+ lines=20,
442
+ elem_id="citation-button",
443
+ show_copy_button=True,
444
+ )
445
+ demo.queue(default_concurrency_limit=40).launch()
446
+
447
+ scheduler = BackgroundScheduler()
448
 
449
  scheduler.add_job(restart_space, "interval", hours=6)
450
 
 
455
  if DEVICE not in {"cpu"}:
456
  _ = subprocess.run(["python", "backend-cli.py"])
457
 
458
+ Thread(target=periodic_init, daemon=True).start()
459
  # scheduler.add_job(launch_backend, "interval", seconds=120)
460
  if __name__ == "__main__":
461
  scheduler.start()
462
+ block_launch()
463
 
backend-cli.py CHANGED
@@ -6,7 +6,6 @@ import argparse
6
 
7
  import socket
8
  import random
9
- import threading
10
  from datetime import datetime
11
 
12
  from src.backend.run_eval_suite import run_evaluation
@@ -16,20 +15,18 @@ from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PAT
16
  from src.backend.manage_requests import EvalRequest
17
  from src.leaderboard.read_evals import EvalResult
18
 
19
- from src.envs import QUEUE_REPO, RESULTS_REPO, API, DEBUG_QUEUE_REPO, DEBUG_RESULTS_REPO
20
- from src.utils import my_snapshot_download, analyze_gpu_stats, parse_nvidia_smi, monitor_gpus, get_gpu_details
21
 
22
  from src.leaderboard.read_evals import get_raw_eval_results
23
 
24
  from typing import Optional
25
- import GPUtil
26
  import time
27
 
28
  import pprint
29
  import logging
30
 
31
- from lm_eval.filters.extraction import RegexFilter
32
-
33
 
34
  # Configure the root logger
35
  logging.basicConfig(
@@ -44,20 +41,6 @@ eval_logger = logging.getLogger("lm-eval")
44
  # Explicitly set the level for 'lm-eval' logger to WARNING
45
  eval_logger.setLevel(logging.WARNING)
46
 
47
- def tuple_input_decorator(func):
48
- def wrapper(self, resps, docs):
49
- stripped_resps = [[resp_data[0] for resp_data in group] for group in resps]
50
-
51
- filtered_resps = func(self, stripped_resps, docs)
52
-
53
- combined_resps = []
54
- for original_group, new_group in zip(resps, filtered_resps):
55
- combined_group = [(new_resp,) + rest_of_data[1:] for new_resp, rest_of_data in zip(new_group, original_group)]
56
- combined_resps.append(combined_group)
57
-
58
- return combined_resps
59
- return wrapper
60
-
61
 
62
  def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
63
  for i in range(10):
@@ -140,23 +123,7 @@ def request_to_result_name(request: EvalRequest) -> str:
140
 
141
 
142
  def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
143
- batch_size = 1
144
- batch_size = eval_request.batch_size
145
-
146
- init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
147
- # if init_gpu_info['Mem(M)'] > 500:
148
- # assert False, f"This machine is not empty: {init_gpu_info}"
149
- gpu_stats_list = []
150
- stop_event = threading.Event()
151
- monitor_thread = threading.Thread(target=monitor_gpus, args=(stop_event, 5, gpu_stats_list))
152
- monitor_thread.start()
153
-
154
- original_apply = RegexFilter.apply
155
- if task.benchmark in ["gsm8k", "gsm8k_cot", "gsm8k_cot_self_consistency", "gsm8k_custom"]:
156
- RegexFilter.apply = tuple_input_decorator(RegexFilter.apply)
157
- else:
158
- RegexFilter.apply = original_apply
159
-
160
  try:
161
  results = run_evaluation(
162
  eval_request=eval_request,
@@ -183,20 +150,6 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
183
  raise
184
 
185
  # print("RESULTS", results)
186
- stop_event.set()
187
- monitor_thread.join()
188
- gpu_info = analyze_gpu_stats(gpu_stats_list)
189
- for task_name in results['results'].keys():
190
- for key, value in gpu_info.items():
191
- if "GPU" not in key:
192
- results['results'][task_name][f"{key},none"] = int(value)
193
- else:
194
- results['results'][task_name][f"{key},none"] = value
195
-
196
- results['results'][task_name]['batch_size,none'] = batch_size
197
- results['results'][task_name]['precision,none'] = eval_request.precision
198
- print(f"gpu_stats_list: {gpu_stats_list}")
199
- print("GPU Usage:", gpu_info)
200
 
201
  dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
202
  # print(dumped)
@@ -217,8 +170,6 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
217
  repo_id=RESULTS_REPO,
218
  repo_type="dataset",
219
  )
220
-
221
- RegexFilter.apply = original_apply
222
  return results
223
 
224
 
@@ -387,9 +338,10 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -
387
 
388
  return False
389
 
 
390
  def process_pending_requests() -> bool:
391
  sanity_checks()
392
- print("Processing pending requests")
393
  current_pending_status = [PENDING_STATUS]
394
 
395
  # Get all eval request that are PENDING, if you want to run other evals, change this parameter
@@ -408,12 +360,6 @@ def process_pending_requests() -> bool:
408
 
409
  eval_request = eval_requests[0]
410
  pp.pprint(eval_request)
411
-
412
- gpu_type = eval_request.gpu_type
413
- curr_gpu_type = get_gpu_details()
414
- if gpu_type != curr_gpu_type:
415
- print(f"GPU type mismatch: {gpu_type} vs {curr_gpu_type}")
416
- return False
417
 
418
  my_snapshot_download(
419
  repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
@@ -450,15 +396,11 @@ def get_args():
450
  parser = argparse.ArgumentParser(description="Run the backend")
451
  parser.add_argument("--debug", action="store_true", help="Run in debug mode")
452
  # debug parameters
453
- parser.add_argument("--task", type=str, default="selfcheckgpt,mmlu, gsm8k", help="Task to debug")
454
- parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1,mistralai/Mixtral-8x7B-v0.1", help="Model to debug")
455
- parser.add_argument("--precision", type=str, default="float32,float16,8bit,4bit", help="Precision to debug")
456
  parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
457
  parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
458
- parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
459
- help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
460
- parser.add_argument("--debug_repo", action="store_true", help="Use debug repo")
461
- parser.add_argument("--model_type", type=str, default="chat", help="Model type")
462
  return parser.parse_args()
463
 
464
 
@@ -466,76 +408,43 @@ if __name__ == "__main__":
466
  args = get_args()
467
  local_debug = args.debug
468
  # debug specific task by ping
469
- if local_debug and not args.debug_repo:
470
- # debug_model_names = [args.model] # Use model from arguments
471
- # debug_task_name = [args.task] # Use task from arguments
472
- debug_model_names = args.model.split(",")
473
- debug_task_name = args.task.split(",")
474
- precisions = args.precision.split(",")
475
- print(f"debug_model_names: {debug_model_names}, debug_task_name: {debug_task_name}, precisions: {precisions}")
476
  task_lst = TASKS_HARNESS.copy()
477
- RESULTS_REPO = DEBUG_RESULTS_REPO
478
- for precision in precisions:
479
  for debug_model_name in debug_model_names:
480
- for task in task_lst:
481
- task_name = task.benchmark
482
- if task_name not in debug_task_name:
483
- continue
484
- # try:
485
- eval_request = EvalRequest(
486
- model=debug_model_name,
487
- private=False,
488
- status="",
489
- json_filepath="",
490
- precision=precision, # Use precision from arguments
491
- inference_framework=args.inference_framework, # Use inference framework from arguments
492
- gpu_type=args.gpu_type,
493
- model_type=args.model_type,
494
- )
495
- curr_gpu_type = get_gpu_details()
496
- if eval_request.gpu_type != curr_gpu_type:
497
- print(f"GPU type mismatch: {eval_request.gpu_type} vs {curr_gpu_type}")
498
- raise Exception("GPU type mismatch")
499
- results = process_evaluation(task, eval_request, limit=args.limit)
500
- # except Exception as e:
501
- # print(f"debug running error: {e}")
502
- elif local_debug and args.debug_repo:
503
- QUEUE_REPO = DEBUG_QUEUE_REPO
504
- RESULTS_REPO = DEBUG_RESULTS_REPO
505
  while True:
506
  res = False
 
507
  # if random.randint(0, 10) == 0:
508
  res = process_pending_requests()
509
  print(f"waiting for 60 seconds")
510
  time.sleep(60)
 
511
  # if res is False:
512
  # if random.randint(0, 5) == 0:
513
  # res = maybe_refresh_results(100)
514
  # else:
515
  # res = process_finished_requests(100)
 
516
  # time.sleep(60)
 
517
  # if res is False:
518
  # if random.randint(0, 5) == 0:
519
  # res = maybe_refresh_results(0)
520
  # else:
521
  # res = process_finished_requests(0)
522
- elif not local_debug and not args.debug_repo:
523
- while True:
524
- res = False
525
- # if random.randint(0, 10) == 0:
526
- res = process_pending_requests()
527
- print(f"waiting for 60 seconds")
528
- time.sleep(60)
529
- # if res is False:
530
- # if random.randint(0, 5) == 0:
531
- # res = maybe_refresh_results(100)
532
- # else:
533
- # res = process_finished_requests(100)
534
- # time.sleep(60)
535
- # if res is False:
536
- # if random.randint(0, 5) == 0:
537
- # res = maybe_refresh_results(0)
538
- # else:
539
- # res = process_finished_requests(0)
540
- else:
541
- raise Exception("Cannot use debug_repo without local debug flag")
 
6
 
7
  import socket
8
  import random
 
9
  from datetime import datetime
10
 
11
  from src.backend.run_eval_suite import run_evaluation
 
15
  from src.backend.manage_requests import EvalRequest
16
  from src.leaderboard.read_evals import EvalResult
17
 
18
+ from src.envs import QUEUE_REPO, RESULTS_REPO, API
19
+ from src.utils import my_snapshot_download
20
 
21
  from src.leaderboard.read_evals import get_raw_eval_results
22
 
23
  from typing import Optional
24
+
25
  import time
26
 
27
  import pprint
28
  import logging
29
 
 
 
30
 
31
  # Configure the root logger
32
  logging.basicConfig(
 
41
  # Explicitly set the level for 'lm-eval' logger to WARNING
42
  eval_logger.setLevel(logging.WARNING)
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
  def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
46
  for i in range(10):
 
123
 
124
 
125
  def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
126
+ batch_size = 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  try:
128
  results = run_evaluation(
129
  eval_request=eval_request,
 
150
  raise
151
 
152
  # print("RESULTS", results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
  dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
155
  # print(dumped)
 
170
  repo_id=RESULTS_REPO,
171
  repo_type="dataset",
172
  )
 
 
173
  return results
174
 
175
 
 
338
 
339
  return False
340
 
341
+
342
  def process_pending_requests() -> bool:
343
  sanity_checks()
344
+
345
  current_pending_status = [PENDING_STATUS]
346
 
347
  # Get all eval request that are PENDING, if you want to run other evals, change this parameter
 
360
 
361
  eval_request = eval_requests[0]
362
  pp.pprint(eval_request)
 
 
 
 
 
 
363
 
364
  my_snapshot_download(
365
  repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
 
396
  parser = argparse.ArgumentParser(description="Run the backend")
397
  parser.add_argument("--debug", action="store_true", help="Run in debug mode")
398
  # debug parameters
399
+ parser.add_argument("--task", type=str, default="selfcheckgpt", help="Task to debug")
400
+ parser.add_argument("--model", type=str, default="facebook/opt-1.3b", help="Model to debug")
401
+ parser.add_argument("--precision", type=str, default="float16", help="Precision to debug")
402
  parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
403
  parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
 
 
 
 
404
  return parser.parse_args()
405
 
406
 
 
408
  args = get_args()
409
  local_debug = args.debug
410
  # debug specific task by ping
411
+ if local_debug:
412
+ debug_model_names = [args.model] # Use model from arguments
413
+ debug_task_name = args.task # Use task from arguments
 
 
 
 
414
  task_lst = TASKS_HARNESS.copy()
415
+ for task in task_lst:
 
416
  for debug_model_name in debug_model_names:
417
+ task_name = task.benchmark
418
+ if task_name != debug_task_name:
419
+ continue
420
+ eval_request = EvalRequest(
421
+ model=debug_model_name,
422
+ private=False,
423
+ status="",
424
+ json_filepath="",
425
+ precision=args.precision, # Use precision from arguments
426
+ inference_framework=args.inference_framework # Use inference framework from arguments
427
+ )
428
+ results = process_evaluation(task, eval_request, limit=args.limit)
429
+ else:
 
 
 
 
 
 
 
 
 
 
 
 
430
  while True:
431
  res = False
432
+
433
  # if random.randint(0, 10) == 0:
434
  res = process_pending_requests()
435
  print(f"waiting for 60 seconds")
436
  time.sleep(60)
437
+
438
  # if res is False:
439
  # if random.randint(0, 5) == 0:
440
  # res = maybe_refresh_results(100)
441
  # else:
442
  # res = process_finished_requests(100)
443
+
444
  # time.sleep(60)
445
+
446
  # if res is False:
447
  # if random.randint(0, 5) == 0:
448
  # res = maybe_refresh_results(0)
449
  # else:
450
  # res = process_finished_requests(0)
 
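For context, `launch_backend()` in app.py starts this worker with `subprocess.run(["python", "backend-cli.py"])`. With the trimmed-down argument parser above, the debug path can be exercised the same way; the flag values below just echo the parser defaults, and `--limit 2` is an arbitrary small sample count for a quick smoke test:

```python
import subprocess

# Run the backend worker in debug mode with the arguments defined in get_args().
# --task/--model/--precision/--inference-framework mirror the parser defaults;
# --limit 2 is only an illustrative small sample count.
subprocess.run([
    "python", "backend-cli.py",
    "--debug",
    "--task", "selfcheckgpt",
    "--model", "facebook/opt-1.3b",
    "--precision", "float16",
    "--inference-framework", "hf-chat",
    "--limit", "2",
])
```
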
requirements.txt CHANGED
@@ -4,7 +4,7 @@ APScheduler
  black
  click
  datasets
- gradio==4.26.0
+ gradio
  gradio_client
  huggingface-hub
  matplotlib
@@ -16,7 +16,7 @@ requests
  semantic-version
  tqdm
  wandb
- transformers
+ transformers>=4.36.0
  tokenizers>=0.15.0
  lm_eval[ifeval] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.2
  accelerate
@@ -27,10 +27,6 @@ cchardet
  rouge_score
  bert-score
  evaluate
- spacy==3.7.4
+ spacy
  selfcheckgpt
  immutabledict
- gputil
- bitsandbytes
- openai
- scikit-learn
 
 
 
 
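The dependency changes go hand in hand with the README update: gradio is no longer pinned here (on Spaces, the `sdk_version` front matter, now 4.9.0, determines the Gradio build), while transformers gains a `>=4.36.0` floor and spacy loses its pin. A quick environment sanity check, purely illustrative, using the package names listed above:

```python
from importlib.metadata import version

# Confirm the installed versions line up with the updated requirements.txt:
# transformers must be >= 4.36.0; gradio follows the Space's sdk_version
# (4.9.0 in README.md) rather than a pin in this file.
for pkg in ("gradio", "transformers", "tokenizers", "spacy"):
    print(pkg, version(pkg))
```
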
src/backend/envs.py CHANGED
@@ -57,13 +57,10 @@ class Tasks(Enum):
 
  # task20 = Task("race", "acc", "RACE", 0)
  task21 = Task("mmlu", "acc", "MMLU", 5)
- task22 = Task("gsm8k_custom", "em", "GSM8K", 5)
- # task23 = Task("gsm8k_cot", "em", "GSM8K", 8)
- task24 = Task("arena_hard", "score", "Arena Hard", 0)
 
 
  EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
  EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
- DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
src/backend/hflm_with_measurement.py CHANGED
@@ -24,7 +24,7 @@ from transformers.models.auto.modeling_auto import (
24
  MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
25
  )
26
  from transformers import TextStreamer
27
- from transformers.models.dbrx.modeling_dbrx import DbrxExpertGLU
28
  from lm_eval import utils
29
  from lm_eval.api.instance import Instance
30
  from lm_eval.api.model import TemplateLM
@@ -37,9 +37,6 @@ from lm_eval.models.utils import (
37
  stop_sequences_criteria,
38
  )
39
  from lm_eval.models.huggingface import HFLM
40
- from src.utils import get_gpu_details, get_peak_bw, transfer_precision2bytes, get_peak_flops
41
- from src.submission.check_validity import get_model_size
42
- from src.envs import API
43
 
44
 
45
  class StopWatch(TextStreamer):
@@ -60,31 +57,16 @@ class StopWatch(TextStreamer):
60
  self.start_decoding = time()
61
  self.decoding_iterations += 1
62
  return
63
-
64
  def end(self):
65
  if self.decoding_time is None and self.start_decoding is not None:
66
  self.decoding_time = time() - self.start_decoding
67
  return
68
-
69
 
70
  class HFLMWithMeasurement(HFLM):
71
  def __init__(self, **kwargs):
72
  super().__init__(**kwargs)
73
- self.pretrained = kwargs.get("pretrained", None)
74
- self.revision = kwargs.get("revision", None)
75
- self.precision = kwargs.get("dtype", None)
76
- self.num_gpus = None
77
-
78
- def _detect_num_gpus_used(self):
79
- if self.num_gpus is not None:
80
- return self.num_gpus
81
- gpus = []
82
- for p in self.model.parameters():
83
- if p.device.type == "cuda":
84
- gpus.append(p.device.index)
85
-
86
- self.num_gpus = len(set(gpus))
87
- return self.num_gpus
88
 
89
  def _loglikelihood_tokens(
90
  self,
@@ -297,7 +279,7 @@ class HFLMWithMeasurement(HFLM):
297
  # Answer: (log prob, is-exact-match)
298
  answer = (float(logits.sum()), bool(max_equal))
299
 
300
- res.append((answer, per_sample_time, 0, 0, 0, 0))
301
 
302
  self.cache_hook.add_partial("loglikelihood", request_str, answer)
303
  pbar.update(1)
@@ -305,16 +287,14 @@ class HFLMWithMeasurement(HFLM):
305
  pbar.close()
306
 
307
  return re_ord.get_original(res)
308
-
309
- def _model_generate(self, context, max_tokens, stop, **generation_kwargs):
310
  # temperature = 0.0 if not set
311
  # if do_sample is false and temp==0.0:
312
  # remove temperature, as do_sample=False takes care of this
313
  # and we don't want a warning from HF
314
  generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
315
  do_sample = generation_kwargs.get("do_sample", None)
316
-
317
- # is_gsm8k = generation_kwargs.get("is_gsm8k", False)
318
 
319
  # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
320
  if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
@@ -322,52 +302,7 @@ class HFLMWithMeasurement(HFLM):
322
 
323
  if do_sample is False and generation_kwargs.get("temperature") == 0.0:
324
  generation_kwargs.pop("temperature")
325
-
326
- # if is_gsm8k:
327
- # generation_kwargs.pop("is_gsm8k")
328
-
329
- context_length = context.shape[1]
330
-
331
- if self.model.__class__.__name__ == "MoE":
332
- model_config = self.model.model.config
333
- else:
334
- model_config = self.model.config
335
-
336
- if not self.precision:
337
- if model_config.quantization_config._load_in_4bit:
338
- self.precision = "4bit"
339
- elif model_config.quantization_config._load_in_8bit:
340
- self.precision = "8bit"
341
- else:
342
- raise ValueError("Unknown precision")
343
-
344
- # print(self.model)
345
- linear_count = 0
346
- element_wise_mul = 0
347
- for name, module in self.model.named_modules():
348
- if ('layers.0.' in name or "transformer.blocks.0" in name) and ('attn' not in name):
349
- if 'experts.0.' in name or "ffn.experts" in name:
350
- if "linear_v" in name:
351
- element_wise_mul = 1
352
- if isinstance(module, torch.nn.Linear):
353
- # print(name, module)
354
- linear_count += 1
355
- elif isinstance(module, DbrxExpertGLU):
356
- linear_count = 3
357
- element_wise_mul = 1
358
- # elif 'experts' not in name:
359
- # if ("gate" not in name and "router" not in name) or "gate_proj" in name:
360
- # if "gate_proj" in name:
361
- # element_wise_mul = 1
362
- # if isinstance(module, torch.nn.Linear):
363
- # # print(name, module)
364
- # linear_count += 1
365
- else:
366
- continue
367
- print(f"linear_count: {linear_count}")
368
- print(f"element_wise_mul: {element_wise_mul}")
369
- print(f"GPU usage: {self._detect_num_gpus_used()}")
370
-
371
  stopping_criteria = stop_sequences_criteria(
372
  self.tokenizer, stop, context.shape[1], context.shape[0]
373
  )
@@ -375,7 +310,7 @@ class HFLMWithMeasurement(HFLM):
375
  start = time()
376
  res = self.model.generate(
377
  input_ids=context,
378
- max_new_tokens=max_tokens,
379
  stopping_criteria=stopping_criteria,
380
  pad_token_id=self.tokenizer.pad_token_id,
381
  use_cache=True,
@@ -383,86 +318,15 @@ class HFLMWithMeasurement(HFLM):
383
  **generation_kwargs,
384
  )
385
  end = time()
386
-
387
  batch_size = context.shape[0]
388
  output_length = stop_watch.decoding_iterations
389
-
390
- precision_bytes = transfer_precision2bytes(self.precision)
391
-
392
- model_size_param = sum(p.numel() for p in self.model.parameters())
393
-
394
- n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else \
395
- (model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layers)
396
-
397
- d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
398
-
399
- if hasattr(model_config, "num_experts_per_tok"):
400
- n_experts_per_tok = model_config.num_experts_per_tok
401
- elif hasattr(model_config, "num_selected_experts"):
402
- n_experts_per_tok = model_config.num_selected_experts
403
- elif hasattr(model_config, "ffn_config"):
404
- n_experts_per_tok = model_config.ffn_config.moe_top_k
405
- else:
406
- n_experts_per_tok = 1
407
-
408
- if hasattr(model_config, "ffn_dim"):
409
- d_ff = model_config.ffn_dim
410
- elif hasattr(model_config, "intermediate_size"):
411
- d_ff = model_config.intermediate_size
412
- elif hasattr(model_config, "d_ff"):
413
- d_ff = model_config.d_ff
414
- elif hasattr(model_config, "ff_ratio"):
415
- d_ff = d_model * model_config.ff_ratio
416
- elif hasattr(model_config, "ffn_config"):
417
- d_ff = model_config.ffn_config.ffn_hidden_size
418
- else:
419
- raise ValueError("Unknown FFN dimension")
420
-
421
- if hasattr(model_config, "num_local_experts"):
422
- num_experts = model_config.num_local_experts
423
- elif hasattr(model_config, "num_experts"):
424
- num_experts = model_config.num_experts
425
- elif hasattr(model_config, "ffn_config"):
426
- num_experts = model_config.ffn_config.moe_num_experts
427
- else:
428
- num_experts = 1
429
-
430
- ffn_params = n_layers * d_ff * linear_count * d_model
431
-
432
- shared_params = model_size_param - num_experts * ffn_params
433
-
434
- model_size = shared_params + n_experts_per_tok * ffn_params
435
-
436
- per_token_kv_size = 2 * n_layers * d_model * precision_bytes
437
-
438
- peak_bw_single = get_peak_bw(get_gpu_details())
439
- peak_bw = peak_bw_single * self._detect_num_gpus_used()
440
-
441
- context_prefill_size = context_length
442
- kv_size = context_prefill_size * per_token_kv_size + (output_length - 1) * per_token_kv_size / 2
443
-
444
- kv_size = kv_size / 1e9
445
-
446
- n_vocab = model_config.vocab_size
447
 
448
  end_to_end_time = (end - start) / batch_size
449
  prefilling_time = stop_watch.prefilling_time / batch_size
450
  decoding_time = stop_watch.decoding_time / batch_size
451
  token_per_sec = output_length / decoding_time
452
- achieve_mem_bw = (model_size * precision_bytes / 1e9 + kv_size) * token_per_sec
453
-
454
- avg_context_length = context_length + (output_length - 1) / 2
455
- flops_per_token = 2 * model_size + ((linear_count + element_wise_mul) * n_layers * avg_context_length * d_model) + 4 * d_model + 2 * d_model * n_vocab
456
- peak_flops_single = get_peak_flops(get_gpu_details(), self.precision)
457
- peak_flops = peak_flops_single * self._detect_num_gpus_used()
458
-
459
- ## TODO only support llama-type decoder only models and moe models of switch transformer and mixtrial
460
- mfu = token_per_sec * flops_per_token / peak_flops
461
- mbu = achieve_mem_bw / peak_bw
462
-
463
- print(f"mfu: {mfu}, mbu: {mbu}")
464
-
465
- return res, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu
466
 
467
  def generate_until(
468
  self, requests: List[Instance], disable_tqdm: bool = False
@@ -539,19 +403,11 @@ class HFLMWithMeasurement(HFLM):
539
  f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
540
  )
541
  # add EOS token to stop sequences
542
- eos = "<|eot_id|>"
543
  if not until:
544
  until = [eos]
545
  else:
546
  until.append(eos)
547
-
548
- # is_gsm8k = kwargs.get("is_gsm8k", False)
549
- # if is_gsm8k:
550
- # until = ["Question:", "Question", "</s>"]
551
- # eos_ids = [self.tokenizer.eos_token_id,
552
- # self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
553
-
554
-
555
  if "max_gen_toks" in kwargs.keys():
556
  max_gen_toks = kwargs.pop("max_gen_toks")
557
  else:
@@ -571,16 +427,14 @@ class HFLMWithMeasurement(HFLM):
571
  left_truncate_len=max_ctx_len,
572
  truncation=self.truncation,
573
  )
574
-
575
- # print("context: ", self.tok_decode(context_enc[0]))
576
  context_enc = context_enc.to(self.device)
577
  attn_masks = attn_masks.to(self.device)
578
 
579
- if "max_tokens" not in kwargs:
580
- kwargs["max_tokens"] = max_gen_toks
581
 
582
  # perform batched generation
583
- cont, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu = self._model_generate(
584
  context=context_enc,
585
  attention_mask=attn_masks,
586
  stop=until,
@@ -591,21 +445,18 @@ class HFLMWithMeasurement(HFLM):
591
  for cont_toks, context in zip(cont_toks_list, contexts):
592
  # discard context + left-padding toks if using causal decoder-only LM
593
  if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
594
- # print("After Generation: ", self.tok_decode(cont_toks))
595
  cont_toks = cont_toks[context_enc.shape[1] :]
596
-
597
  s = self.tok_decode(cont_toks)
598
 
599
- # # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
600
- # if not is_gsm8k:
601
  for term in until:
602
  if len(term) > 0:
603
  # ignore '' separator,
604
  # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
605
  s = s.split(term)[0]
606
-
607
- # print(s)
608
- res.append((s, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu))
609
 
610
  self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
611
  pbar.update(1)
 
24
  MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
25
  )
26
  from transformers import TextStreamer
27
+
28
  from lm_eval import utils
29
  from lm_eval.api.instance import Instance
30
  from lm_eval.api.model import TemplateLM
 
37
  stop_sequences_criteria,
38
  )
39
  from lm_eval.models.huggingface import HFLM
 
 
 
40
 
41
 
42
  class StopWatch(TextStreamer):
 
57
  self.start_decoding = time()
58
  self.decoding_iterations += 1
59
  return
60
+
61
  def end(self):
62
  if self.decoding_time is None and self.start_decoding is not None:
63
  self.decoding_time = time() - self.start_decoding
64
  return
65
+
66
 
67
  class HFLMWithMeasurement(HFLM):
68
  def __init__(self, **kwargs):
69
  super().__init__(**kwargs)
 
 
70
 
71
  def _loglikelihood_tokens(
72
  self,
 
279
  # Answer: (log prob, is-exact-match)
280
  answer = (float(logits.sum()), bool(max_equal))
281
 
282
+ res.append((answer, per_sample_time, 0, 0))
283
 
284
  self.cache_hook.add_partial("loglikelihood", request_str, answer)
285
  pbar.update(1)
 
287
  pbar.close()
288
 
289
  return re_ord.get_original(res)
290
+
291
+ def _model_generate(self, context, max_length, stop, **generation_kwargs):
292
  # temperature = 0.0 if not set
293
  # if do_sample is false and temp==0.0:
294
  # remove temperature, as do_sample=False takes care of this
295
  # and we don't want a warning from HF
296
  generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
297
  do_sample = generation_kwargs.get("do_sample", None)
 
 
298
 
299
  # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
300
  if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
 
302
 
303
  if do_sample is False and generation_kwargs.get("temperature") == 0.0:
304
  generation_kwargs.pop("temperature")
305
+ # build stopping criteria
 
 
306
  stopping_criteria = stop_sequences_criteria(
307
  self.tokenizer, stop, context.shape[1], context.shape[0]
308
  )
 
310
  start = time()
311
  res = self.model.generate(
312
  input_ids=context,
313
+ max_length=max_length,
314
  stopping_criteria=stopping_criteria,
315
  pad_token_id=self.tokenizer.pad_token_id,
316
  use_cache=True,
 
318
  **generation_kwargs,
319
  )
320
  end = time()
321
+
322
  batch_size = context.shape[0]
323
  output_length = stop_watch.decoding_iterations
 
 
324
 
325
  end_to_end_time = (end - start) / batch_size
326
  prefilling_time = stop_watch.prefilling_time / batch_size
327
  decoding_time = stop_watch.decoding_time / batch_size
328
  token_per_sec = output_length / decoding_time
329
+ return res, end_to_end_time, prefilling_time, token_per_sec
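For readers unfamiliar with the convention assumed here: in Hugging Face `generate`, `max_length` bounds the total sequence (prompt plus generated tokens), whereas the removed `max_tokens`/`max_new_tokens` style counted generated tokens only. That is why the caller in `generate_until` below passes `context_enc.shape[1] + max_gen_toks`. A small arithmetic sketch with made-up numbers:

```python
# Hypothetical values, purely to illustrate the max_length arithmetic used by the caller.
prompt_len = 512                         # context_enc.shape[1]
max_gen_toks = 256                       # per-task generation budget
max_length = prompt_len + max_gen_toks   # 768 total positions allowed by model.generate(...)
assert max_length - prompt_len == max_gen_toks  # so at most 256 new tokens are produced
```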
 
 
330
 
331
  def generate_until(
332
  self, requests: List[Instance], disable_tqdm: bool = False
 
403
  f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
404
  )
405
  # add EOS token to stop sequences
406
+ eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
407
  if not until:
408
  until = [eos]
409
  else:
410
  until.append(eos)
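The hard-coded "<|eot_id|>" stop string is replaced by decoding the loaded tokenizer's own end-of-text token, so the stop sequence follows whatever model is being evaluated. A rough plain-transformers equivalent (the model name is a hypothetical example, and the leaderboard code goes through `self.tok_decode(self.eot_token_id, ...)` instead):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")  # hypothetical example
eos = tok.decode([tok.eos_token_id], skip_special_tokens=False)
print(eos)  # "</s>" here; chat-tuned tokenizers often yield strings such as "<|im_end|>" or "<|eot_id|>"
```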
 
 
411
  if "max_gen_toks" in kwargs.keys():
412
  max_gen_toks = kwargs.pop("max_gen_toks")
413
  else:
 
427
  left_truncate_len=max_ctx_len,
428
  truncation=self.truncation,
429
  )
 
 
430
  context_enc = context_enc.to(self.device)
431
  attn_masks = attn_masks.to(self.device)
432
 
433
+ if "max_length" not in kwargs:
434
+ kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
435
 
436
  # perform batched generation
437
+ cont, end_to_end_time, prefilling_time, token_per_sec = self._model_generate(
438
  context=context_enc,
439
  attention_mask=attn_masks,
440
  stop=until,
 
445
  for cont_toks, context in zip(cont_toks_list, contexts):
446
  # discard context + left-padding toks if using causal decoder-only LM
447
  if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
 
448
  cont_toks = cont_toks[context_enc.shape[1] :]
449
+
450
  s = self.tok_decode(cont_toks)
451
 
452
+ # use secondary stop seqs to cut off should-have-been-stopped content post-hoc
 
453
  for term in until:
454
  if len(term) > 0:
455
  # ignore '' separator,
456
  # for seq2seq case where self.tok_decode(self.eot_token_id) = ''
457
  s = s.split(term)[0]
458
+
459
+ res.append((s, end_to_end_time, prefilling_time, token_per_sec))
 
460
 
461
  self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
462
  pbar.update(1)
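Taken together, the hunks above reduce the per-request measurement to end-to-end time, prefilling time, and decoding throughput, all derived from the `StopWatch` text streamer whose fragments appear earlier in this file. The sketch below shows how such a streamer can be wired; it is reconstructed from those fragments and only indicative of the actual source:

```python
from time import time
from transformers import TextStreamer

class StopWatchSketch(TextStreamer):
    """Times prefill vs. decode by hooking the streamer callbacks of model.generate()."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_prefilling = None
        self.prefilling_time = None
        self.start_decoding = None
        self.decoding_time = None
        self.decoding_iterations = 0

    def put(self, value):
        # generate() first calls put() with the prompt (prefill), then once per decoded step.
        if self.start_prefilling is None:
            self.start_prefilling = time()
            return
        if self.prefilling_time is None:
            self.prefilling_time = time() - self.start_prefilling
            self.start_decoding = time()
        self.decoding_iterations += 1

    def end(self):
        # Called once when generation finishes.
        if self.decoding_time is None and self.start_decoding is not None:
            self.decoding_time = time() - self.start_decoding
```

With those fields, `_model_generate` computes `token_per_sec` as the decoding-step count divided by the per-sample decoding time, exactly as shown in the hunk above.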
src/backend/manage_requests.py CHANGED
@@ -27,24 +27,24 @@ class EvalRequest:
27
  likes: Optional[int] = 0
28
  params: Optional[int] = None
29
  license: Optional[str] = ""
30
- batch_size: Optional[int] = 1
31
- gpu_type: Optional[str] = "NVIDIA-A100-PCIe-80GB"
32
 
33
  def get_model_args(self) -> str:
34
  model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
35
- model_args += ",trust_remote_code=True,device_map=auto"
36
  if self.precision in ["float16", "float32", "bfloat16"]:
37
  model_args += f",dtype={self.precision}"
38
  # Quantized models need some added config, the install of bits and bytes, etc
39
  # elif self.precision == "8bit":
40
  # model_args += ",load_in_8bit=True"
41
- elif self.precision == "4bit":
42
- model_args += ",load_in_4bit=True"
43
  # elif self.precision == "GPTQ":
44
  # A GPTQ model does not need dtype to be specified,
45
  # it will be inferred from the config
 
46
  elif self.precision == "8bit":
47
  model_args += ",load_in_8bit=True"
 
48
  else:
49
  raise Exception(f"Unknown precision {self.precision}.")
50
  return model_args
 
27
  likes: Optional[int] = 0
28
  params: Optional[int] = None
29
  license: Optional[str] = ""
 
 
30
 
31
  def get_model_args(self) -> str:
32
  model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
33
+
34
  if self.precision in ["float16", "float32", "bfloat16"]:
35
  model_args += f",dtype={self.precision}"
36
  # Quantized models need some added config, the install of bits and bytes, etc
37
  # elif self.precision == "8bit":
38
  # model_args += ",load_in_8bit=True"
39
+ # elif self.precision == "4bit":
40
+ # model_args += ",load_in_4bit=True"
41
  # elif self.precision == "GPTQ":
42
  # A GPTQ model does not need dtype to be specified,
43
  # it will be inferred from the config
44
+ pass
45
  elif self.precision == "8bit":
46
  model_args += ",load_in_8bit=True"
47
+ model_args += ",trust_remote_code=True"
48
  else:
49
  raise Exception(f"Unknown precision {self.precision}.")
50
  return model_args
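To make the `get_model_args` change concrete, this is the lm-eval model_args string the new logic composes; the branches mirror the code above, and the model name and revision are placeholder values, not taken from the PR:

```python
# Placeholder identifiers only; mirrors the precision handling shown above.
model, revision, precision = "some-org/some-moe-model", "main", "bfloat16"

model_args = f"pretrained={model},revision={revision},parallelize=True"
if precision in ["float16", "float32", "bfloat16"]:
    model_args += f",dtype={precision}"
elif precision == "8bit":
    model_args += ",load_in_8bit=True"
    model_args += ",trust_remote_code=True"
else:
    raise Exception(f"Unknown precision {precision}.")

print(model_args)
# pretrained=some-org/some-moe-model,revision=main,parallelize=True,dtype=bfloat16
```

Note that the unconditional `,trust_remote_code=True,device_map=auto` suffix from the old version is no longer appended for float16/bfloat16/float32 submissions.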
src/backend/moe_infinity.py CHANGED
@@ -31,20 +31,15 @@ class MoEHFLM(HFLMWithMeasurement):
31
  self.use_chat_template = use_chat_template
32
  if "device" in kwargs:
33
  kwargs.pop("device")
34
- if os.path.exists(os.path.join(self.offload_path, "moe-infinity-offloads")):
35
- shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
36
- kwargs["device_map"] = "cuda:0"
37
  super().__init__(
38
- *args, **kwargs, pretrained=pretrained
39
  ) # Assuming HFLM accepts a 'pretrained' arg and handles it
40
  # self._create_model()
 
41
 
42
  def __del__(self):
43
- self._model.engine.clean_up() # clean up hooks
44
- self._model.engine.archer_engine.clean_up_resources() # clean up resources
45
- if os.path.exists(os.path.join(self.offload_path, "moe-infinity-offloads")):
46
- shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads")) # clean up offload model
47
-
48
 
49
  def _create_model(self, *args, **kwargs):
50
  """
 
31
  self.use_chat_template = use_chat_template
32
  if "device" in kwargs:
33
  kwargs.pop("device")
 
 
 
34
  super().__init__(
35
+ *args, **kwargs, pretrained=pretrained, device_map="cuda:0"
36
  ) # Assuming HFLM accepts a 'pretrained' arg and handles it
37
  # self._create_model()
38
+ shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
39
 
40
  def __del__(self):
41
+ # Clean up offloaded models from self.offload_path
42
+ shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
 
 
 
43
 
44
  def _create_model(self, *args, **kwargs):
45
  """
src/backend/run_eval_suite.py CHANGED
@@ -13,20 +13,16 @@ orig_higher_is_better = ConfigurableTask.higher_is_better
13
  def process_results_decorator(func):
14
  def wrapper(self, doc, results, *args, **kwargs):
15
  processed_results = [r[0] for r in results]
16
-
17
  end_to_end_time = sum([r[1] for r in results]) / len(results)
18
  prefilling_time = sum([r[2] for r in results]) / len(results)
19
  decoding_throughput = sum([r[3] for r in results]) / len(results)
20
- mfu = sum([r[4] for r in results]) / len(results)
21
- mbu = sum([r[5] for r in results]) / len(results)
22
  # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
23
 
24
  result_dict = func(self, doc, processed_results, *args, **kwargs)
25
  result_dict["end_to_end_time"] = end_to_end_time
26
  result_dict["prefilling_time"] = prefilling_time
27
  result_dict["decoding_throughput"] = decoding_throughput
28
- result_dict["mfu"] = mfu
29
- result_dict["mbu"] = mbu
30
  return result_dict
31
  return wrapper
32
  ConfigurableTask.process_results = process_results_decorator(orig_process_results)
@@ -37,8 +33,6 @@ def aggregation_decorator(func):
37
  aggregation_list["end_to_end_time"] = mean
38
  aggregation_list["prefilling_time"] = mean
39
  aggregation_list["decoding_throughput"] = mean
40
- aggregation_list["mfu"] = mean
41
- aggregation_list["mbu"] = mean
42
  return aggregation_list
43
  return wrapper
44
  ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
@@ -49,8 +43,6 @@ def higher_is_better_decorator(func):
49
  higher_is_better_dict["end_to_end_time"] = False
50
  higher_is_better_dict["prefilling_time"] = False
51
  higher_is_better_dict["decoding_throughput"] = True
52
- higher_is_better_dict["mfu"] = True
53
- higher_is_better_dict["mbu"] = True
54
  return higher_is_better_dict
55
  return wrapper
56
  ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
 
13
  def process_results_decorator(func):
14
  def wrapper(self, doc, results, *args, **kwargs):
15
  processed_results = [r[0] for r in results]
16
+
17
  end_to_end_time = sum([r[1] for r in results]) / len(results)
18
  prefilling_time = sum([r[2] for r in results]) / len(results)
19
  decoding_throughput = sum([r[3] for r in results]) / len(results)
 
 
20
  # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
21
 
22
  result_dict = func(self, doc, processed_results, *args, **kwargs)
23
  result_dict["end_to_end_time"] = end_to_end_time
24
  result_dict["prefilling_time"] = prefilling_time
25
  result_dict["decoding_throughput"] = decoding_throughput
 
 
26
  return result_dict
27
  return wrapper
28
  ConfigurableTask.process_results = process_results_decorator(orig_process_results)
 
33
  aggregation_list["end_to_end_time"] = mean
34
  aggregation_list["prefilling_time"] = mean
35
  aggregation_list["decoding_throughput"] = mean
 
 
36
  return aggregation_list
37
  return wrapper
38
  ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
 
43
  higher_is_better_dict["end_to_end_time"] = False
44
  higher_is_better_dict["prefilling_time"] = False
45
  higher_is_better_dict["decoding_throughput"] = True
 
 
46
  return higher_is_better_dict
47
  return wrapper
48
  ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
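With MFU and MBU dropped, each per-request result tuple carries four fields, `(output, end_to_end_time, prefilling_time, decoding_throughput)`, and the patched `ConfigurableTask` hooks above only read indices 1 through 3. A minimal, generic sketch of the monkey-patching pattern these decorators use (simplified from the code above, not a drop-in replacement):

```python
from lm_eval.api.task import ConfigurableTask

def process_results_decorator(func):
    def wrapper(self, doc, results, *args, **kwargs):
        # Each result carries (task_output, end_to_end_time, prefilling_time, decoding_throughput).
        task_outputs = [r[0] for r in results]
        end_to_end_time = sum(r[1] for r in results) / len(results)
        result_dict = func(self, doc, task_outputs, *args, **kwargs)  # original metric computation
        result_dict["end_to_end_time"] = end_to_end_time              # attach the timing metric
        return result_dict
    return wrapper

# Re-assigning the wrapped method on the class makes every task report the extra field.
ConfigurableTask.process_results = process_results_decorator(ConfigurableTask.process_results)
```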
src/backend/tasks/arena_hard/__init__.py DELETED
File without changes
src/backend/tasks/arena_hard/arena_hard.yaml DELETED
@@ -1,2 +0,0 @@
1
- task: arena_hard
2
- class: !function task.ArenaHard
 
 
 
src/backend/tasks/arena_hard/arena_judgment.py DELETED
@@ -1,256 +0,0 @@
1
- '''
2
- This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
3
- under the Apache 2.0 License from the arena-hard project.
4
- (https://github.com/lm-sys/arena-hard)
5
- Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
6
- See the NOTICE file distributed with this work for additional
7
- information regarding copyright ownership.
8
- '''
9
-
10
- import pandas as pd
11
- from tqdm import tqdm
12
- import numpy as np
13
- from sklearn.linear_model import LogisticRegression
14
- import math
15
- from collections import defaultdict
16
- from tqdm import tqdm
17
-
18
- from src.backend.tasks.arena_hard.arena_utils import (
19
- chat_completion_openai,
20
- load_questions,
21
- load_model_answers,
22
- get_endpoint,
23
- make_config,
24
- )
25
-
26
-
27
- def get_score(judgment, pattern, pairwise=True):
28
- matches = pattern.findall(judgment)
29
- matches = [m for m in matches if m != ""]
30
- if len(set(matches)) == 0:
31
- return None, True
32
- elif len(set(matches)) == 1:
33
- if pairwise:
34
- return matches[0].strip("\n"), False
35
- return int(matches[0])
36
- else:
37
- return None, False
38
-
39
-
40
- # get answer from model
41
- def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None):
42
- api_dict = get_endpoint(endpoint_dict["endpoints"])
43
-
44
- # if endpoint_dict["api_type"] == "anthropic":
45
- # output = chat_completion_anthropic(model, conv, temperature, max_tokens)
46
- # elif endpoint_dict["api_type"] == "azure":
47
- # output = chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict)
48
-
49
- output = chat_completion_openai(model, conv, temperature, max_tokens, api_dict)
50
- return output
51
-
52
-
53
- def judgment(**args):
54
- question = args["question"]
55
- answer = args["answer"]
56
- reference = args["reference"]
57
- baseline = args["baseline_answer"]
58
- configs = args["configs"]
59
- # output_file = args["output_file"]
60
- model = configs["judge_model"]
61
-
62
- num_games = 2 if configs["pairwise"] else 1
63
-
64
- # output = {
65
- # "question_id":question["question_id"],
66
- # "judge": model,
67
- # "model": "custom_model",
68
- # "games":[]
69
- # }
70
- output = [question["question_id"]]
71
-
72
- for game in range(num_games):
73
- conv = [{"role": "system", "content": configs["system_prompt"]}]
74
-
75
- for template in configs["prompt_template"]:
76
- prompt_args = {}
77
-
78
- prompt_args[f"question_{1}"] = question["content"]
79
- base = 1
80
-
81
- if baseline:
82
- if game % 2 == 1: # swap position
83
- temp = baseline
84
- baseline = answer
85
- answer = temp
86
-
87
- if game == 0:
88
- for i, turn in enumerate(baseline["choices"][0]["turns"]):
89
- prompt_args[f"answer_{i+1}"] = turn["content"]
90
- base += 1
91
-
92
- if game == 1:
93
- prompt_args[f"answer_{1}"] = baseline
94
- base += 1
95
-
96
- if answer:
97
- prompt_args[f"answer_{base}"] = answer
98
-
99
- if reference:
100
- for j, ref_answer in enumerate(reference):
101
- for i, turn in enumerate(ref_answer["choices"][0]["turns"]):
102
- prompt_args[f"ref_answer_{i+j+1}"] = turn["content"]
103
-
104
- user_prompt = template.format(**prompt_args)
105
- conv.append({"role": "user", "content": user_prompt})
106
-
107
- judgment = ""
108
- for _ in range(2):
109
- new_judgment = get_answer(
110
- model,
111
- conv,
112
- configs["temperature"],
113
- configs["max_tokens"],
114
- args["endpoint_dict"],
115
- )
116
-
117
- judgment += ("\n" + new_judgment)
118
-
119
- score, try_again = get_score(judgment, args["regex_pattern"])
120
-
121
- conv.append({"role": "assistant", "content": new_judgment})
122
-
123
- if not try_again:
124
- break
125
-
126
- conv.append({"role": "user", "content": "continue your judgment and finish by outputting a final verdict label"})
127
- print("Finish judgment!!!")
128
- # result = {
129
- # "user_prompt": conv[1]["content"],
130
- # "judgment": judgment,
131
- # "score":score
132
- # }
133
- output.append(score)
134
-
135
- return output
136
-
137
- def get_battles_from_scores(score_list, first_game_only=False, WEIGHT=3):
138
- arena_hard_battles = pd.DataFrame()
139
-
140
- print("Turning score list into battles...")
141
-
142
- for scores in tqdm(score_list):
143
- question_id, score1, score2 = scores
144
-
145
- # Process game 1
146
- output = {"question_id": question_id,
147
- "model_a": "gpt-4-0314",
148
- "model_b": f"custom_model"} # Unique identifier for model
149
- weight = 1
150
- if score1 == "A=B":
151
- output["winner"] = "tie"
152
- elif score1 == "A>B":
153
- output["winner"] = "model_a"
154
- elif score1 == "A>>B":
155
- output["winner"] = "model_a"
156
- weight = WEIGHT
157
- elif score1 == "B>A":
158
- output["winner"] = "model_b"
159
- elif score1 == "B>>A":
160
- output["winner"] = "model_b"
161
- weight = WEIGHT
162
- else:
163
- weight = 0
164
-
165
- if weight:
166
- arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
167
-
168
- if not first_game_only:
169
- # Process game 2
170
- output = {"question_id": question_id,
171
- "model_a": "gpt-4-0314",
172
- "model_b": f"custom_model"} # Unique identifier for model
173
- weight = 1
174
- if score2 == "A=B":
175
- output["winner"] = "tie"
176
- elif score2 == "A>B":
177
- output["winner"] = "model_b"
178
- elif score2 == "A>>B":
179
- output["winner"] = "model_b"
180
- weight = WEIGHT
181
- elif score2 == "B>A":
182
- output["winner"] = "model_a"
183
- elif score2 == "B>>A":
184
- output["winner"] = "model_a"
185
- weight = WEIGHT
186
- else:
187
- weight = 0
188
-
189
- if weight:
190
- arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
191
-
192
- arena_hard_battles.to_json("./arena_hard_battles.jsonl", lines=True, orient="records")
193
- return arena_hard_battles
194
-
195
- def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
196
- models = pd.concat([df["model_a"], df["model_b"]]).unique()
197
- models = pd.Series(np.arange(len(models)), index=models)
198
-
199
- LOW_RATING = 100
200
- # duplicate battles
201
- df = pd.concat([df, df], ignore_index=True)
202
- p = len(models.index)
203
- n = df.shape[0]
204
-
205
- X = np.zeros([n, p])
206
- X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
207
- X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
208
-
209
- # one A win => two A win
210
- Y = np.zeros(n)
211
- Y[df["winner"] == "model_a"] = 1.0
212
-
213
- # one tie => one A win + one B win
214
- # find tie + tie (both bad) index
215
- tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
216
- tie_idx[len(tie_idx)//2:] = False
217
- Y[tie_idx] = 1.0
218
-
219
- if len(np.unique(Y)) == 1:
220
- # If there's only one class in the data, assign default ratings
221
- elo_scores = np.full(p, LOW_RATING)
222
- elo_scores[models["gpt-4-0314"]] = INIT_RATING
223
- else:
224
- lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
225
- lr.fit(X,Y)
226
-
227
- elo_scores = SCALE * lr.coef_[0] + INIT_RATING
228
-
229
- # set anchor as gpt-4-0314 = 1000
230
- if "gpt-4-0314" in models.index:
231
- elo_scores += 1000 - elo_scores[models["gpt-4-0314"]]
232
- return pd.Series(elo_scores, index = models.index).sort_values(ascending=False)
233
-
234
- def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
235
- names = sorted(list(elo_ratings.keys()))
236
- wins = defaultdict(lambda: defaultdict(lambda: 0))
237
- for a in names:
238
- for b in names:
239
- ea = 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))
240
- wins[a][b] = ea
241
- wins[b][a] = 1 - ea
242
-
243
- data = {
244
- a: [wins[a][b] if a != b else np.NAN for b in names]
245
- for a in names
246
- }
247
-
248
- df = pd.DataFrame(data, index=names)
249
- df.index.name = "model_a"
250
- df.columns.name = "model_b"
251
- return df.T
252
-
253
- def get_win_rate_column(df, column, baseline="gpt-4-0314"):
254
- to_dict = df[["model", column]].set_index("model").to_dict()[column]
255
- win_rate_table = predict_win_rate(to_dict)
256
- return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))
 
 
src/backend/tasks/arena_hard/arena_utils.py DELETED
@@ -1,349 +0,0 @@
1
- '''
2
- This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
3
- under the Apache 2.0 License from the arena-hard project.
4
- (https://github.com/lm-sys/arena-hard)
5
- Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
6
- See the NOTICE file distributed with this work for additional
7
- information regarding copyright ownership.
8
- '''
9
-
10
-
11
- import os
12
- import json
13
- import time
14
- import yaml
15
- import random
16
-
17
- from typing import Optional
18
- from glob import glob
19
-
20
- # API setting constants
21
- API_MAX_RETRY = 16
22
- API_RETRY_SLEEP = 10
23
- API_ERROR_OUTPUT = "$ERROR$"
24
-
25
-
26
- OPENAI_MODEL_LIST = (
27
- "gpt-3.5-turbo",
28
- "gpt-3.5-turbo-0301",
29
- "gpt-3.5-turbo-0613",
30
- "gpt-3.5-turbo-0613-verbose",
31
- "gpt-3.5-turbo-1106",
32
- "gpt-3.5-turbo-0125",
33
- "gpt-4",
34
- "gpt-4-0314",
35
- "gpt-4-0613",
36
- "gpt-4-turbo",
37
- "gpt-4-1106-preview",
38
- "gpt-4-0125-preview",
39
- )
40
-
41
-
42
- temperature_config = {
43
- "writing": 0.7,
44
- "roleplay": 0.7,
45
- "extraction": 0.0,
46
- "math": 0.0,
47
- "coding": 0.0,
48
- "reasoning": 0.0,
49
- "stem": 0.1,
50
- "humanities": 0.1,
51
- }
52
-
53
-
54
- def load_questions(question_file: str):
55
- """Load questions from a file."""
56
- questions = []
57
- with open(question_file, "r") as ques_file:
58
- for line in ques_file:
59
- if line:
60
- questions.append(json.loads(line))
61
- return questions
62
-
63
-
64
- def load_model_answers(answer_dir: str):
65
- """Load model answers.
66
-
67
- The return value is a python dict of type:
68
- Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
69
- """
70
- filenames = glob(os.path.join(answer_dir, "*.jsonl"))
71
- filenames.sort()
72
- model_answers = {}
73
-
74
- for filename in filenames:
75
- model_name = os.path.basename(filename)[:-6]
76
- answer = {}
77
- with open(filename) as fin:
78
- for line in fin:
79
- line = json.loads(line)
80
- answer[line["question_id"]] = line
81
- model_answers[model_name] = answer
82
-
83
- return model_answers
84
-
85
-
86
- def get_endpoint(endpoint_list):
87
- if endpoint_list is None:
88
- return None
89
- assert endpoint_list is not None
90
- # randomly pick one
91
- api_dict = random.choices(
92
- endpoint_list
93
- )[0]
94
- return api_dict
95
-
96
-
97
- # load config args from config yaml files
98
- def make_config(config_file: str) -> dict:
99
- config_kwargs = {}
100
- with open(config_file, "r") as f:
101
- config_kwargs = yaml.load(f, Loader=yaml.SafeLoader)
102
-
103
- return config_kwargs
104
-
105
-
106
- def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
107
- import openai
108
- if api_dict:
109
- client = openai.OpenAI(
110
- base_url=api_dict["api_base"],
111
- api_key=api_dict["api_key"],
112
- )
113
- else:
114
- client = openai.OpenAI()
115
-
116
- output = API_ERROR_OUTPUT
117
- for _ in range(API_MAX_RETRY):
118
- try:
119
- # print(messages)
120
- completion = client.chat.completions.create(
121
- model=model,
122
- messages=messages,
123
- temperature=temperature,
124
- max_tokens=max_tokens
125
- )
126
- output = completion.choices[0].message.content
127
- break
128
- except openai.RateLimitError as e:
129
- print(type(e), e)
130
- time.sleep(API_RETRY_SLEEP)
131
- except openai.BadRequestError as e:
132
- print(messages)
133
- print(type(e), e)
134
- except KeyError:
135
- print(type(e), e)
136
- break
137
-
138
- return output
139
-
140
-
141
- # def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None):
142
- # import openai
143
- # from openai import AzureOpenAI
144
-
145
- # api_base = api_dict["api_base"]
146
- # client = AzureOpenAI(
147
- # azure_endpoint = api_base,
148
- # api_key= api_dict["api_key"],
149
- # api_version=api_dict["api_version"],
150
- # timeout=240,
151
- # max_retries=2
152
- # )
153
-
154
- # output = API_ERROR_OUTPUT
155
- # for _ in range(API_MAX_RETRY):
156
- # try:
157
- # response = client.chat.completions.create(
158
- # model=model,
159
- # messages=messages,
160
- # n=1,
161
- # temperature=temperature,
162
- # max_tokens=max_tokens,
163
- # seed=42,
164
- # )
165
- # output = response.choices[0].message.content
166
- # break
167
- # except openai.RateLimitError as e:
168
- # print(type(e), e)
169
- # time.sleep(API_RETRY_SLEEP)
170
- # except openai.BadRequestError as e:
171
- # print(type(e), e)
172
- # break
173
- # except KeyError:
174
- # print(type(e), e)
175
- # break
176
-
177
- # return output
178
-
179
-
180
- # def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict=None):
181
- # import anthropic
182
-
183
- # if api_dict:
184
- # api_key = api_dict["api_key"]
185
- # else:
186
- # api_key = os.environ["ANTHROPIC_API_KEY"]
187
-
188
- # sys_msg = ""
189
- # if messages[0]["role"] == "system":
190
- # sys_msg = messages[0]["content"]
191
- # messages = messages[1:]
192
-
193
- # output = API_ERROR_OUTPUT
194
- # for _ in range(API_MAX_RETRY):
195
- # try:
196
- # # print(sys_msg)
197
- # c = anthropic.Anthropic(api_key=api_key)
198
- # response = c.messages.create(
199
- # model=model,
200
- # messages=messages,
201
- # stop_sequences=[anthropic.HUMAN_PROMPT],
202
- # max_tokens=max_tokens,
203
- # temperature=temperature,
204
- # system=sys_msg
205
- # )
206
- # output = response.content[0].text
207
- # break
208
- # except anthropic.APIError as e:
209
- # print(type(e), e)
210
- # time.sleep(API_RETRY_SLEEP)
211
- # return output
212
-
213
-
214
- # def chat_completion_mistral(model, messages, temperature, max_tokens):
215
- # from mistralai.client import MistralClient
216
- # from mistralai.models.chat_completion import ChatMessage
217
- # from mistralai.exceptions import MistralException
218
-
219
- # api_key = os.environ["MISTRAL_API_KEY"]
220
- # client = MistralClient(api_key=api_key)
221
-
222
- # prompts = [ChatMessage(role=message["role"], content=message["content"]) for message in messages]
223
-
224
- # output = API_ERROR_OUTPUT
225
- # for _ in range(API_MAX_RETRY):
226
- # try:
227
- # chat_response = client.chat(
228
- # model=model,
229
- # messages=prompts,
230
- # temperature=temperature,
231
- # max_tokens=max_tokens,
232
- # )
233
- # output = chat_response.choices[0].message.content
234
- # break
235
- # except MistralException as e:
236
- # print(type(e), e)
237
- # break
238
-
239
- # return output
240
-
241
-
242
- # def chat_completion_gemini(model, messages, temperature, max_tokens):
243
- # import google.generativeai as genai
244
- # genai.configure(api_key=os.environ["GEMINI_API_KEY"])
245
-
246
- # safety_settings = [
247
- # {
248
- # "category": "HARM_CATEGORY_HARASSMENT",
249
- # "threshold": "BLOCK_NONE"
250
- # },
251
- # {
252
- # "category": "HARM_CATEGORY_HATE_SPEECH",
253
- # "threshold": "BLOCK_NONE"
254
- # },
255
- # {
256
- # "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
257
- # "threshold": "BLOCK_NONE"
258
- # },
259
- # {
260
- # "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
261
- # "threshold": "BLOCK_NONE"
262
- # },
263
- # ]
264
-
265
- # # Set up the model
266
- # generation_config = {
267
- # "temperature": temperature,
268
- # "top_p": 1,
269
- # "top_k": 1,
270
- # "max_output_tokens": max_tokens,
271
- # }
272
-
273
- # output = API_ERROR_OUTPUT
274
- # for _ in range(API_MAX_RETRY):
275
- # try:
276
- # gemini = genai.GenerativeModel(
277
- # model_name=model,
278
- # generation_config=generation_config,
279
- # safety_settings=safety_settings)
280
-
281
- # convo = gemini.start_chat(history=[])
282
-
283
- # convo.send_message(messages)
284
- # output = convo.last.text
285
- # break
286
- # except genai.types.generation_types.StopCandidateException as e:
287
- # print(type(e), e)
288
- # break
289
- # except Exception as e:
290
- # print(type(e), e)
291
- # time.sleep(API_RETRY_SLEEP)
292
-
293
- # return output
294
-
295
-
296
- # def chat_completion_cohere(model, messages, temperature, max_tokens):
297
- # import cohere
298
-
299
- # co = cohere.Client(os.environ["COHERE_API_KEY"])
300
- # assert len(messages) > 0
301
-
302
- # template_map = {"system":"SYSTEM",
303
- # "assistant":"CHATBOT",
304
- # "user":"USER"}
305
-
306
- # assert messages[-1]["role"] == "user"
307
- # prompt = messages[-1]["content"]
308
-
309
- # if len(messages) > 1:
310
- # history = []
311
- # for message in messages[:-1]:
312
- # history.append({"role":template_map[message["role"]], "message":message["content"]})
313
- # else:
314
- # history = None
315
-
316
- # output = API_ERROR_OUTPUT
317
- # for _ in range(API_MAX_RETRY):
318
- # try:
319
- # response = co.chat(
320
- # message=prompt,
321
- # model=model,
322
- # temperature=temperature,
323
- # max_tokens=max_tokens,
324
- # chat_history=history,
325
- # )
326
- # output = response.text
327
- # break
328
- # except cohere.core.api_error.ApiError as e:
329
- # print(type(e), e)
330
- # raise
331
- # except Exception as e:
332
- # print(type(e), e)
333
- # break
334
-
335
- # return output
336
-
337
-
338
- def reorg_answer_file(answer_file):
339
- """Sort by question id and de-duplication"""
340
- answers = {}
341
- with open(answer_file, "r") as fin:
342
- for l in fin:
343
- qid = json.loads(l)["question_id"]
344
- answers[qid] = l
345
-
346
- qids = sorted(list(answers.keys()))
347
- with open(answer_file, "w") as fout:
348
- for qid in qids:
349
- fout.write(answers[qid])
 
 
src/backend/tasks/arena_hard/configs/api_config.yaml DELETED
@@ -1,17 +0,0 @@
1
- # gpt-3.5-turbo:
2
- # model_name: gpt-3.5-turbo
3
- # endpoints: null
4
- # api_type: openai
5
- # parallel: 8
6
-
7
- gpt-4-1106-preview:
8
- model_name: gpt-4-1106-preview
9
- endpoints: null
10
- api_type: openai
11
- parallel: 8
12
-
13
- # llama3-7b:
14
- # model_name: llama3-7b
15
- # endpoints: null
16
- # api_type: openai
17
- # parallel: 8
 
 
src/backend/tasks/arena_hard/configs/judge_config.yaml DELETED
@@ -1,26 +0,0 @@
1
- name: judgment config file for Arena Hard
2
-
3
- bench_name: arena-hard-v0.1
4
-
5
- # Arena Hard default
6
- judge_model: gpt-4-1106-preview
7
- # judge_model: gpt-3.5-turbo
8
- reference: False # Optional
9
- ref_model: null
10
-
11
- baseline: True
12
- baseline_model: gpt-4-0314
13
-
14
- pairwise: True
15
- temperature: 0
16
- max_tokens: 4096
17
-
18
- regex_pattern: \[\[([AB<>=]+)\]\]
19
-
20
- system_prompt: "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
21
-
22
- prompt_template: ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"]
23
-
24
- # Add your model below for evaluation
25
- # model_list:
26
- # - gpt-3.5-turbo-0125
 
 
src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
src/backend/tasks/arena_hard/question.jsonl DELETED
The diff for this file is too large to render. See raw diff
 
src/backend/tasks/arena_hard/task.py DELETED
@@ -1,220 +0,0 @@
1
- import os
2
- from typing import Union, List
3
-
4
- from lm_eval.api.task import ConfigurableTask
5
- from lm_eval.api.instance import Instance
6
-
7
- # from lm_eval.api.registry import register_task
8
- from lm_eval.api.metrics import mean
9
-
10
- from src.backend.envs import DEVICE
11
-
12
- import pandas as pd
13
-
14
- from src.backend.tasks.measurement_task_utils import measure_system_metrics
15
- import json
16
-
17
- from typing import (
18
- Any,
19
- Dict,
20
- List,
21
- Optional,
22
- Union,
23
- )
24
-
25
- from datasets import Dataset
26
- import re
27
-
28
- from src.backend.tasks.arena_hard.arena_utils import (
29
- load_questions,
30
- load_questions,
31
- load_model_answers,
32
- make_config,
33
- )
34
-
35
- from src.backend.tasks.arena_hard.arena_judgment import (
36
- judgment,
37
- get_battles_from_scores,
38
- compute_mle_elo,
39
- predict_win_rate,
40
- get_win_rate_column
41
- )
42
-
43
- def load_questions(question_file: str):
44
- """Load questions from a file."""
45
- questions = []
46
- with open(question_file, "r") as ques_file:
47
- for line in ques_file:
48
- if line:
49
- questions.append(json.loads(line))
50
- return questions
51
-
52
- def download_wrapper(func):
53
- def download(self, *args, **kwargs):
54
- print("Using Arena Hard, No need to download")
55
- return download
56
-
57
- original_download = ConfigurableTask.download
58
- ConfigurableTask.download = download_wrapper(original_download)
59
- # @register_task("selfcheckgpt")
60
- @measure_system_metrics
61
- class ArenaHard(ConfigurableTask):
62
- VERSION = 0.0
63
- OUTPUT_TYPE = "generate_until"
64
- data_path = os.path.join(os.path.dirname(__file__), 'question.jsonl')
65
- judge_config_path = os.path.join(os.path.dirname(__file__), "configs/judge_config.yaml")
66
- configs = make_config(judge_config_path)
67
- model_ans_dir = os.path.join(os.path.dirname(__file__), "model_answer")
68
- model_answers = load_model_answers(model_ans_dir)
69
- data = load_questions(data_path)
70
-
71
- def __init__(self):
72
- super().__init__(config={"metadata": {"version": self.VERSION}})
73
- # these end tokens are hard coded because of the current limitaion of the llm-eval.
74
- # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
75
- self.generation_kwargs = {"until": ["</s>", "<|im_end|>"], "max_gen_toks": 4096}
76
- # self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
77
- # self.generation_kwargs_sampling = {
78
- # "temperature": 0.99,
79
- # "do_sample": True,
80
- # "until": ["<im_end>", "<im_end>"],
81
- # "max_length": 1024,
82
- # }
83
-
84
- def transform_data(self, data):
85
- transformed_data = []
86
- for i in range(len(data)):
87
- if self.configs["baseline"]:
88
- baseline_answer = self.model_answers[self.configs["baseline_model"]][data[i]["question_id"]]
89
- else:
90
- baseline_answer = None
91
- transformed_item = {
92
- "question_id": data[i]["question_id"],
93
- "content": data[i]["turns"][0]["content"], # Assuming you want the first turn's content
94
- "model_answer": baseline_answer
95
- }
96
- transformed_data.append(transformed_item)
97
- return transformed_data
98
-
99
- def has_training_docs(self):
100
- return False
101
-
102
- def has_validation_docs(self):
103
- return True
104
-
105
- def has_test_docs(self):
106
- return False
107
-
108
- def validation_docs(self):
109
- self.dataset = self.transform_data(self.data)
110
- self.dataset = Dataset.from_dict({"question_id": [item["question_id"] for item in self.dataset],
111
- "content": [item["content"] for item in self.dataset],
112
- "model_answer": [item["model_answer"] for item in self.dataset]})
113
- return self.dataset
114
-
115
- def doc_to_text(self, doc):
116
- sentence = doc["content"]
117
- doc_text = f"{sentence}\n"
118
- return doc_text
119
-
120
- def doc_to_target(self, doc):
121
- q_id = doc["question_id"]
122
- return q_id
123
-
124
- def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
125
- arguments = (ctx, self.generation_kwargs)
126
- request_list = [
127
- Instance(request_type="generate_until", doc=doc, arguments=arguments, idx=0, **kwargs),
128
- ]
129
- # sampling_arguments = (ctx, self.generation_kwargs_sampling)
130
- # request_list.extend(
131
- # [
132
- # Instance(request_type="generate_until", doc=doc, arguments=sampling_arguments, idx=idx, **kwargs)
133
- # for idx in range(1, self.generation_kwargs_sampling_number + 1)
134
- # ]
135
- # )
136
- return request_list
137
-
138
- def process_results(self, doc, results):
139
- response_temperature_0 = results[0]
140
- # other_responses = results[1:]
141
- api_config_path = os.path.join(os.path.dirname(__file__), "configs/api_config.yaml")
142
- endpoint_list = make_config(api_config_path)
143
-
144
- if self.configs["regex_pattern"]:
145
- pattern = re.compile(self.configs["regex_pattern"])
146
-
147
- ref_answer_dir = os.path.join(os.path.dirname(__file__), "reference_answer")
148
-
149
- ref_answers = None
150
- if self.configs["reference"]:
151
- ref_answers = load_model_answers(ref_answer_dir)
152
- ref_answers = [ref_answers[model] for model in self.configs["ref_model"]]
153
-
154
- # output_files = {}
155
- # models = ["custom_model"]
156
- # output_dir = f"{os.path.join(os.path.dirname(__file__))}/model_judgments/{self.configs['judge_model']}"
157
- # for model in models:
158
- # output_files[model] = os.path.join(
159
- # output_dir,
160
- # f"{model}.jsonl",
161
- # )
162
-
163
- # for output_file in output_files.values():
164
- # os.makedirs(os.path.dirname(output_file), exist_ok=True)
165
-
166
- endpoint_info = endpoint_list[self.configs["judge_model"]]
167
-
168
- question = doc
169
- kwargs = {}
170
- kwargs["question"] = question
171
- kwargs["answer"] = response_temperature_0
172
- if ref_answers:
173
- kwargs["reference"] = [ref_answer[doc["question_id"]] for ref_answer in ref_answers]
174
- assert len(kwargs["reference"]) == len(self.configs["ref_model"])
175
- else:
176
- kwargs["reference"] = None
177
-
178
- if self.configs["baseline"]:
179
- kwargs["baseline_answer"] = doc["model_answer"]
180
- else:
181
- kwargs["baseline_answer"] = None
182
- kwargs["configs"] = self.configs
183
- kwargs["endpoint_dict"] = endpoint_info
184
- # kwargs["output_file"] = output_files["custom_model"]
185
- kwargs["regex_pattern"] = pattern
186
-
187
- scores = judgment(**kwargs)
188
- return {"score": scores}
189
-
190
- def aggregation(self):
191
- """
192
- :returns: {str: [float] -> float}
193
- A dictionary where keys are the names of submetrics and values are
194
- functions that aggregate a list of metrics
195
- """
196
- ##TODO implement the aggregation function to calculate elo for score
197
- def get_win_rate(score_list):
198
- battles = get_battles_from_scores(score_list)
199
- bootstrap_online_elo = compute_mle_elo(battles)
200
- stats = pd.DataFrame()
201
- stats["results"] = None
202
- stats["results"] = stats['results'].astype('object')
203
- for i, model in enumerate(bootstrap_online_elo.index):
204
- stats.at[i, "model"] = model
205
- stats.at[i, "score"] = bootstrap_online_elo[model]
206
-
207
- stats.sort_values(by="model", inplace=True)
208
- stats["score"] = get_win_rate_column(stats, "score", "gpt-4-0314").tolist()
209
-
210
- return stats["score"][1]
211
-
212
- return {k: get_win_rate for k in ["score"]}
213
-
214
- def higher_is_better(self):
215
- """
216
- :returns: {str: bool}
217
- A dictionary where keys are the names of submetrics and values are
218
- whether a higher value of the submetric is better
219
- """
220
- return {k: True for k in ["score"]}
 
 
src/backend/tasks/gsm8k/gsm8k-custom.yaml DELETED
@@ -1,47 +0,0 @@
1
- group:
2
- - math_word_problems
3
- task: gsm8k_custom
4
- dataset_path: gsm8k
5
- dataset_name: main
6
- output_type: generate_until
7
- training_split: train
8
- fewshot_split: train
9
- test_split: test
10
- doc_to_text: "Question: {{question}}\nAnswer:"
11
- doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
12
- metric_list:
13
- - metric: exact_match
14
- aggregation: mean
15
- higher_is_better: true
16
- ignore_case: true
17
- ignore_punctuation: false
18
- regexes_to_ignore:
19
- - ","
20
- - "\\$"
21
- - "(?s).*#### "
22
- - "\\.$"
23
- generation_kwargs:
24
- until:
25
- - "Question:"
26
- - "Question"
27
- - "</s>"
28
- - "<|im_end|>"
29
- do_sample: false
30
- temperature: 0.0
31
- # is_gsm8k: true
32
- repeats: 1
33
- num_fewshot: 5
34
- filter_list:
35
- - name: "strict-match"
36
- filter:
37
- - function: "regex"
38
- regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
39
- - function: "take_first"
40
- - name: "flexible-extract"
41
- filter:
42
- - function: "regex"
43
- group_select: -1
44
- regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
45
- - function: "take_first"
46
- metadata:
47
- version: 3.0
 
 
src/backend/tasks/measurement_task_utils.py CHANGED
@@ -12,9 +12,6 @@ def process_results_decorator(func):
12
  end_to_end_time = sum([r[1] for r in results]) / len(results)
13
  prefilling_time = sum([r[2] for r in results]) / len(results)
14
  decoding_throughput = sum([r[3] for r in results]) / len(results)
15
- mfu = sum([r[4] for r in results]) / len(results)
16
- mbu = sum([r[5] for r in results]) / len(results)
17
-
18
  # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
19
 
20
  # Now call the original process_results with the processed results
@@ -22,8 +19,6 @@ def process_results_decorator(func):
22
  result_dict["end_to_end_time"] = end_to_end_time
23
  result_dict["prefilling_time"] = prefilling_time
24
  result_dict["decoding_throughput"] = decoding_throughput
25
- result_dict["mfu"] = mfu
26
- result_dict["mbu"] = mbu
27
  return result_dict
28
  return wrapper
29
 
@@ -35,8 +30,6 @@ def aggregation_decorator(func):
35
  aggregation_list["end_to_end_time"] = mean
36
  aggregation_list["prefilling_time"] = mean
37
  aggregation_list["decoding_throughput"] = mean
38
- aggregation_list["mfu"] = mean
39
- aggregation_list["mbu"] = mean
40
  return aggregation_list
41
  return wrapper
42
 
@@ -48,8 +41,6 @@ def higher_is_better_decorator(func):
48
  higher_is_better_dict["end_to_end_time"] = False
49
  higher_is_better_dict["prefilling_time"] = False
50
  higher_is_better_dict["decoding_throughput"] = True
51
- higher_is_better_dict["mfu"] = True
52
- higher_is_better_dict["mbu"] = True
53
  return higher_is_better_dict
54
  return wrapper
55
 
 
12
  end_to_end_time = sum([r[1] for r in results]) / len(results)
13
  prefilling_time = sum([r[2] for r in results]) / len(results)
14
  decoding_throughput = sum([r[3] for r in results]) / len(results)
 
 
 
15
  # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
16
 
17
  # Now call the original process_results with the processed results
 
19
  result_dict["end_to_end_time"] = end_to_end_time
20
  result_dict["prefilling_time"] = prefilling_time
21
  result_dict["decoding_throughput"] = decoding_throughput
 
 
22
  return result_dict
23
  return wrapper
24
 
 
30
  aggregation_list["end_to_end_time"] = mean
31
  aggregation_list["prefilling_time"] = mean
32
  aggregation_list["decoding_throughput"] = mean
 
 
33
  return aggregation_list
34
  return wrapper
35
 
 
41
  higher_is_better_dict["end_to_end_time"] = False
42
  higher_is_better_dict["prefilling_time"] = False
43
  higher_is_better_dict["decoding_throughput"] = True
 
 
44
  return higher_is_better_dict
45
  return wrapper
46
 
src/backend/tasks/selfcheckgpt/task.py CHANGED
@@ -27,12 +27,12 @@ class SelfCheckGPT(ConfigurableTask):
27
  super().__init__(config={"metadata": {"version": self.VERSION}})
28
  # these end tokens are hard coded because of the current limitaion of the llm-eval.
29
  # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
30
- self.generation_kwargs = {"until": ["<|im_end|>"], "max_length": 1024}
31
  self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
32
  self.generation_kwargs_sampling = {
33
  "temperature": 0.99,
34
  "do_sample": True,
35
- "until": ["<|im_end|>", "</s>"],
36
  "max_length": 1024,
37
  }
38
 
 
27
  super().__init__(config={"metadata": {"version": self.VERSION}})
28
  # these end tokens are hard coded because of the current limitaion of the llm-eval.
29
  # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
30
+ self.generation_kwargs = {"until": ["<im_end>"], "max_length": 1024}
31
  self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
32
  self.generation_kwargs_sampling = {
33
  "temperature": 0.99,
34
  "do_sample": True,
35
+ "until": ["<im_end>", "</s>"],
36
  "max_length": 1024,
37
  }
38
 
src/display/about.py CHANGED
@@ -3,36 +3,23 @@ from src.display.utils import ModelType
3
  TITLE = """<h1 align="center" id="space-title">OPEN-MOE-LLM-LEADERBOARD</h1>"""
4
 
5
  INTRODUCTION_TEXT = """
6
- The OPEN-MOE-LLM-LEADERBOARD is specifically designed to assess the performance and efficiency of various Mixture of Experts (MoE) Large Language Models (LLMs).
7
- This initiative, driven by the open-source community, aims to comprehensively evaluate these advanced MoE LLMs.
8
 
9
  The OPEN-MOE-LLM-LEADERBOARD includes generation and multiple choice tasks to measure the performance and efficiency of MOE LLMs.
10
 
11
 
12
  Tasks:
 
13
  - **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
14
- - **Mathematics Problem-Solving Performance** -- [GSM8K](https://arxiv.org/abs/2110.14168)
15
- - **AI Judgment Scores for Responses to Complex User Queries** -- [Arena_Hard](https://lmsys.org/blog/2024-04-19-arena-hard/)
16
 
17
  Columns and Metrics:
18
  - Method: The MoE LLM inference framework.
19
  - E2E(s): Average end-to-end generation time in seconds.
20
  - PRE(s): Prefilling time of the input prompt in seconds.
21
  - T/s: Decoding throughput in tokens per second.
22
- - S-MBU(%): Sparse Model Bandwidth Utilization.
23
- - S-MFU(%): Sparse Model FLOPs Utilization.
24
  - Precision: The precision of the model.
25
 
26
  """
27
-
28
- ACKNOWLEDGEMENT_TEXT = """
29
- <div>
30
- <h4>Acknowledgements</h4>
31
- {image_html}
32
- <p>We express our sincere gratitude to <a href="https://netmind.ai/home">NetMind.AI</a> for their generous donation of GPUs, which plays a crucial role in ensuring the continuous operation of our Leaderboard.</p>
33
- </div>
34
- """
35
-
36
  LLM_BENCHMARKS_TEXT = f"""
37
 
38
  """
 
3
  TITLE = """<h1 align="center" id="space-title">OPEN-MOE-LLM-LEADERBOARD</h1>"""
4
 
5
  INTRODUCTION_TEXT = """
6
+ The OPEN-MOE-LLM-LEADERBOARD is specifically designed to assess the performance and efficiency of various Mixture of Experts (MoE) Large Language Models (LLMs). This initiative, driven by the open-source community, aims to comprehensively evaluate these advanced MoE LLMs. We extend our gratitude to Hugging Face for the GPU community grant that supported the initial debugging process, and to [NetMind.AI](https://netmind.ai/home) for their generous GPU donation, which ensures the continuous operation of the Leaderboard.
 
7
 
8
  The OPEN-MOE-LLM-LEADERBOARD includes generation and multiple-choice tasks to measure the performance and efficiency of MoE LLMs.
9
 
10
 
11
  Tasks:
12
+ - **Generation Self-consistency** -- [SelfCheckGPT](https://github.com/potsawee/selfcheckgpt)
13
  - **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
 
 
14
 
15
  Columns and Metrics:
16
  - Method: The MoE LLM inference framework.
17
  - E2E(s): Average end-to-end generation time in seconds.
18
  - PRE(s): Prefilling time of the input prompt in seconds.
19
  - T/s: Decoding throughput in tokens per second.


20
  - Precision: The precision of the model.
21
 
22
  """
 
 
 
 
 
 
 
 
 
23
  LLM_BENCHMARKS_TEXT = f"""
24
 
25
  """
src/display/imgs/Netmind.AI_LOGO.jpg DELETED
Binary file (6.92 kB)
 
src/display/utils.py CHANGED
@@ -13,33 +13,6 @@ TS = "T/s" #Decoding throughput (tok/s)
13
  InFrame = "Method" #"Inference framework"
14
  MULTIPLE_CHOICEs = ["mmlu"]
15
 
16
- GPU_TEMP = 'Temp(C)'
17
- GPU_Power = 'Power(W)'
18
- GPU_Mem = 'Mem(G)'
19
- GPU_Name = "GPU"
20
- GPU_Util = 'Util(%)'
21
- MFU = 'S-MFU(%)'
22
- MBU = 'S-MBU(%)'
23
- BATCH_SIZE = 'bs'
24
- PRECISION = "Precision"
25
- system_metrics_to_name_map = {
26
- "end_to_end_time": f"{E2Es}",
27
- "prefilling_time": f"{PREs}",
28
- "decoding_throughput": f"{TS}",
29
- "mfu": f"{MFU}",
30
- "mbu": f"{MBU}"
31
- }
32
-
33
- gpu_metrics_to_name_map = {
34
- GPU_Util: GPU_Util,
35
- GPU_TEMP: GPU_TEMP,
36
- GPU_Power: GPU_Power,
37
- GPU_Mem: GPU_Mem,
38
- "batch_size": BATCH_SIZE,
39
- "precision": PRECISION,
40
- GPU_Name: GPU_Name
41
- }
42
-
43
  @dataclass
44
  class Task:
45
  benchmark: str
@@ -77,11 +50,8 @@ class Tasks(Enum):
77
  # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
78
 
79
  # # XXX include me back at some point
80
- # selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
81
  mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
82
- gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
83
- # gsm8k_cot = Task("gsm8k_cot", "em", "GSM8K COT") #GSM8K COT/EM (5-shot)
84
- arena_hard = Task("arena_hard", "score", "Arena Hard") #Arena Hard/Score
85
 
86
 
87
  # These classes are for user facing column names,
@@ -106,35 +76,27 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
106
  # # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
107
 
108
  # Inference framework
109
- auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True, dummy=True)])
110
 
111
  for task in Tasks:
112
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
113
  # System performance metrics
114
- auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name} {E2Es}", "number", True, hidden=True)])
115
- auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
116
- # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
117
- # auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
118
- auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
119
- # auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
120
  if task.value.benchmark in MULTIPLE_CHOICEs:
121
  continue
122
- # auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name} {PREs}", "number", False, hidden=True)])
123
- auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name} {TS}", "number", True, hidden=True)])
124
- auto_eval_column_dict.append([f"{task.name}_mbu", ColumnContent, ColumnContent(f"{task.value.col_name} {MBU}", "number", True, hidden=True)])
125
- auto_eval_column_dict.append([f"{task.name}_mfu", ColumnContent, ColumnContent(f"{task.value.col_name} {MFU}", "number", True, hidden=True)])
126
-
127
 
128
  # Model information
129
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False, dummy=True)])
130
- # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
131
- # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
132
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True, dummy=True)])
133
- # auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
134
- # auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
135
- # auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
136
- # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
137
- # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
138
  # Dummy column for the search bar (hidden by the custom CSS)
139
  auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
140
 
@@ -160,10 +122,10 @@ class ModelDetails:
160
 
161
 
162
  class ModelType(Enum):
163
- # PT = ModelDetails(name="pretrained", symbol="🟢")
164
- # FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
165
  chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
166
- # merges = ModelDetails(name="base merges and moerges", symbol="🤝")
167
  Unknown = ModelDetails(name="", symbol="?")
168
 
169
  def to_str(self, separator=" "):
@@ -171,24 +133,21 @@ class ModelType(Enum):
171
 
172
  @staticmethod
173
  def from_str(type):
174
- # if "fine-tuned" in type or "🔶" in type:
175
- # return ModelType.FT
176
- # if "pretrained" in type or "🟢" in type:
177
- # return ModelType.PT
178
  if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
179
  return ModelType.chat
180
- # if "merge" in type or "🤝" in type:
181
- # return ModelType.merges
182
  return ModelType.Unknown
183
 
184
 
185
  class InferenceFramework(Enum):
186
  # "moe-infinity", hf-chat
187
- # MoE_Infinity = ModelDetails("moe-infinity")
188
  HF_Chat = ModelDetails("hf-chat")
189
- VLLM = ModelDetails("vllm_moe")
190
- TRTLLM = ModelDetails("tensorrt_llm")
191
- VLLM_FIX = ModelDetails("vllm_moe_fixbs")
192
  Unknown = ModelDetails("?")
193
 
194
  def to_str(self):
@@ -196,34 +155,13 @@ class InferenceFramework(Enum):
196
 
197
  @staticmethod
198
  def from_str(inference_framework: str):
199
- # if inference_framework in ["moe-infinity"]:
200
- # return InferenceFramework.MoE_Infinity
201
- if inference_framework in ["tensorrt_llm"]:
202
- return InferenceFramework.TRTLLM
203
  if inference_framework in ["hf-chat"]:
204
  return InferenceFramework.HF_Chat
205
- if inference_framework in ["vllm_moe"]:
206
- return InferenceFramework.VLLM
207
- if inference_framework in ["vllm_moe_fixbs"]:
208
- return InferenceFramework.VLLM_FIX
209
  return InferenceFramework.Unknown
210
 
211
- class GPUType(Enum):
212
- A100_sxm = ModelDetails("NVIDIA-A100-SXM4-80GB")
213
- A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB")
214
- Unknown = ModelDetails("?")
215
 
216
- def to_str(self):
217
- return self.value.name
218
-
219
- @staticmethod
220
- def from_str(gpu_type: str):
221
- if gpu_type in ["NVIDIA-A100-PCIe-80GB"]:
222
- return GPUType.A100_pcie
223
- if gpu_type in ["NVIDIA-A100-SXM4-80GB"]:
224
- return GPUType.A100_sxm
225
- return GPUType.Unknown
226
-
227
  class WeightType(Enum):
228
  Adapter = ModelDetails("Adapter")
229
  Original = ModelDetails("Original")
@@ -231,34 +169,34 @@ class WeightType(Enum):
231
 
232
 
233
  class Precision(Enum):
234
- # float32 = ModelDetails("float32")
235
- # float16 = ModelDetails("float16")
236
  bfloat16 = ModelDetails("bfloat16")
237
  qt_8bit = ModelDetails("8bit")
238
  qt_4bit = ModelDetails("4bit")
239
- # qt_GPTQ = ModelDetails("GPTQ")
240
  Unknown = ModelDetails("?")
241
 
242
  @staticmethod
243
  def from_str(precision: str):
244
- # if precision in ["torch.float32", "float32"]:
245
- # return Precision.float32
246
- # if precision in ["torch.float16", "float16"]:
247
- # return Precision.float16
248
  if precision in ["torch.bfloat16", "bfloat16"]:
249
  return Precision.bfloat16
250
  if precision in ["8bit"]:
251
  return Precision.qt_8bit
252
  if precision in ["4bit"]:
253
  return Precision.qt_4bit
254
- # if precision in ["GPTQ", "None"]:
255
- # return Precision.qt_GPTQ
256
  return Precision.Unknown
257
 
258
 
259
  # Column selection
260
- COLS = [c.name for c in fields(AutoEvalColumn)]
261
- TYPES = [c.type for c in fields(AutoEvalColumn)]
262
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
263
  TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
264
 
 
13
  InFrame = "Method" #"Inference framework"
14
  MULTIPLE_CHOICEs = ["mmlu"]
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  @dataclass
17
  class Task:
18
  benchmark: str
 
50
  # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")
51
 
52
  # # XXX include me back at some point
53
+ selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
54
  mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
 
 
 
55
 
56
 
57
  # These classes are for user facing column names,
 
76
  # # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])
77
 
78
  # Inference framework
79
+ auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True)])
80
 
81
  for task in Tasks:
82
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
83
  # System performance metrics
84
+ auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name}-{E2Es}", "number", True)])
 
 
 
 
 
85
  if task.value.benchmark in MULTIPLE_CHOICEs:
86
  continue
87
+ auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name}-{PREs}", "number", True)])
88
+ auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name}-{TS}", "number", True)])
 
 
 
89
 
90
  # Model information
91
+ auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
92
+ auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
93
+ auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
94
+ auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
95
+ auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
96
+ auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
97
+ auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
98
+ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
99
+ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
100
  # Dummy column for the search bar (hidden by the custom CSS)
101
  auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
102
 
 
122
 
123
 
124
  class ModelType(Enum):
125
+ PT = ModelDetails(name="pretrained", symbol="🟢")
126
+ FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
127
  chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
128
+ merges = ModelDetails(name="base merges and moerges", symbol="🤝")
129
  Unknown = ModelDetails(name="", symbol="?")
130
 
131
  def to_str(self, separator=" "):
 
133
 
134
  @staticmethod
135
  def from_str(type):
136
+ if "fine-tuned" in type or "🔶" in type:
137
+ return ModelType.FT
138
+ if "pretrained" in type or "🟢" in type:
139
+ return ModelType.PT
140
  if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
141
  return ModelType.chat
142
+ if "merge" in type or "🤝" in type:
143
+ return ModelType.merges
144
  return ModelType.Unknown
145
 
146
 
147
  class InferenceFramework(Enum):
148
  # "moe-infinity", hf-chat
149
+ MoE_Infinity = ModelDetails("moe-infinity")
150
  HF_Chat = ModelDetails("hf-chat")
 
 
 
151
  Unknown = ModelDetails("?")
152
 
153
  def to_str(self):
 
155
 
156
  @staticmethod
157
  def from_str(inference_framework: str):
158
+ if inference_framework in ["moe-infinity"]:
159
+ return InferenceFramework.MoE_Infinity
 
 
160
  if inference_framework in ["hf-chat"]:
161
  return InferenceFramework.HF_Chat
 
 
 
 
162
  return InferenceFramework.Unknown
163
 
 
 
 
 
164
165
  class WeightType(Enum):
166
  Adapter = ModelDetails("Adapter")
167
  Original = ModelDetails("Original")
 
169
 
170
 
171
  class Precision(Enum):
172
+ float32 = ModelDetails("float32")
173
+ float16 = ModelDetails("float16")
174
  bfloat16 = ModelDetails("bfloat16")
175
  qt_8bit = ModelDetails("8bit")
176
  qt_4bit = ModelDetails("4bit")
177
+ qt_GPTQ = ModelDetails("GPTQ")
178
  Unknown = ModelDetails("?")
179
 
180
  @staticmethod
181
  def from_str(precision: str):
182
+ if precision in ["torch.float32", "float32"]:
183
+ return Precision.float32
184
+ if precision in ["torch.float16", "float16"]:
185
+ return Precision.float16
186
  if precision in ["torch.bfloat16", "bfloat16"]:
187
  return Precision.bfloat16
188
  if precision in ["8bit"]:
189
  return Precision.qt_8bit
190
  if precision in ["4bit"]:
191
  return Precision.qt_4bit
192
+ if precision in ["GPTQ", "None"]:
193
+ return Precision.qt_GPTQ
194
  return Precision.Unknown
195
 
196
 
197
  # Column selection
198
+ COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
199
+ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
200
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
201
  TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
202
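The `auto_eval_column_dict` entries above are `[attribute_name, annotation, ColumnContent(...)]` triples that are later turned into a frozen dataclass whose fields drive `COLS`/`TYPES`. A self-contained sketch of that pattern (the `ColumnContent` field layout and the `fields` helper below are assumptions for illustration):

```python
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                       # user-facing column label
    type: str                       # gradio dataframe column type
    displayed_by_default: bool = True
    hidden: bool = False
    dummy: bool = False

# Assumed entry shape: [attribute_name, annotation, default ColumnContent]
auto_eval_column_dict = [
    ["inference_framework", ColumnContent, ColumnContent("Method", "str", True)],
    ["mmlu", ColumnContent, ColumnContent("MMLU", "number", True)],
    ["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)],
]

AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

def fields(cls):
    # Collect the ColumnContent defaults attached to the generated dataclass
    return [f.default for f in cls.__dataclass_fields__.values()]

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
print(COLS)   # ['Method', 'MMLU', 'model_name_for_query']
print(TYPES)  # ['str', 'number', 'str']
```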
 
src/leaderboard/read_evals.py CHANGED
@@ -65,11 +65,11 @@ class EvalResult:
65
  if len(org_and_model) == 1:
66
  org = None
67
  model = org_and_model[0]
68
- result_key = f"{model}_{precision.value.name}_{inference_framework}"
69
  else:
70
  org = org_and_model[0]
71
  model = org_and_model[1]
72
- result_key = f"{org}_{model}_{precision.value.name}_{inference_framework}"
73
  full_model = "/".join(org_and_model)
74
 
75
  still_on_hub, error, model_config = is_model_on_hub(
@@ -103,13 +103,6 @@ class EvalResult:
103
 
104
  if to_add is True:
105
  multiplier = 100.0
106
- if "GPU" in metric:
107
- results[benchmark][metric] = value
108
- continue
109
- if "precision" in metric:
110
- results[benchmark][metric] = value
111
- continue
112
-
113
  if "rouge" in metric and "truthful" not in benchmark:
114
  multiplier = 1.0
115
  if "squad" in benchmark:
@@ -118,17 +111,9 @@ class EvalResult:
118
  multiplier = 1.0
119
  if "throughput" in metric:
120
  multiplier = 1.0
121
- if "batch_" in metric or "Mem" in metric or "Util" in metric:
122
- multiplier = 1
123
-
124
  # print('RESULTS', data['results'])
125
  # print('XXX', benchmark, metric, value, multiplier)
126
- if value == "N/A":
127
- results[benchmark][metric] = "-"
128
- elif value == "auto":
129
- results[benchmark][metric] = "auto"
130
- else:
131
- results[benchmark][metric] = value * multiplier
132
 
133
  res = EvalResult(
134
  eval_name=result_key,
@@ -140,7 +125,6 @@ class EvalResult:
140
  revision=config.get("model_sha", ""),
141
  still_on_hub=still_on_hub,
142
  architecture=architecture,
143
- model_type=ModelType.from_str(config.get("model_type", "")),
144
  inference_framework=inference_framework,
145
  )
146
 
@@ -175,22 +159,22 @@ class EvalResult:
175
 
176
  # breakpoint()
177
  # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
178
-
179
  data_dict = {
180
  "eval_name": self.eval_name, # not a column, just a save name,
181
  AutoEvalColumn.precision.name: self.precision.value.name,
182
- # AutoEvalColumn.model_type.name: self.model_type.value.name,
183
  AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
184
- # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
185
- # AutoEvalColumn.architecture.name: self.architecture,
186
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
187
  AutoEvalColumn.dummy.name: self.full_model,
188
- # AutoEvalColumn.revision.name: self.revision,
189
- # # AutoEvalColumn.average.name: average,
190
- # AutoEvalColumn.license.name: self.license,
191
- # AutoEvalColumn.likes.name: self.likes,
192
- # AutoEvalColumn.params.name: self.num_params,
193
- # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
194
  AutoEvalColumn.inference_framework.name: self.inference_framework,
195
  }
196
 
@@ -278,22 +262,15 @@ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool
278
 
279
  eval_results = {}
280
  for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
281
- try:
282
- # Creation of result
283
- eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
284
- eval_result.update_with_request_file(requests_path)
285
-
286
- # Store results of same eval together
287
- eval_name = eval_result.eval_name
288
- if eval_name in eval_results.keys():
289
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
290
- else:
291
- eval_results[eval_name] = eval_result
292
-
293
- except (FileNotFoundError, ValueError, KeyError, json.JSONDecodeError) as e:
294
- # Log the error and continue with the next file
295
- print(f"Error processing file {model_result_filepath}: {e}")
296
- continue
297
 
298
  results = []
299
  for v in eval_results.values():
 
65
  if len(org_and_model) == 1:
66
  org = None
67
  model = org_and_model[0]
68
+ result_key = f"{model}_{precision.value.name}"
69
  else:
70
  org = org_and_model[0]
71
  model = org_and_model[1]
72
+ result_key = f"{org}_{model}_{precision.value.name}"
73
  full_model = "/".join(org_and_model)
74
 
75
  still_on_hub, error, model_config = is_model_on_hub(
 
103
 
104
  if to_add is True:
105
  multiplier = 100.0
 
 
 
 
 
 
 
106
  if "rouge" in metric and "truthful" not in benchmark:
107
  multiplier = 1.0
108
  if "squad" in benchmark:
 
111
  multiplier = 1.0
112
  if "throughput" in metric:
113
  multiplier = 1.0
 
 
 
114
  # print('RESULTS', data['results'])
115
  # print('XXX', benchmark, metric, value, multiplier)
116
+ results[benchmark][metric] = value * multiplier
 
 
 
 
 
117
 
118
  res = EvalResult(
119
  eval_name=result_key,
 
125
  revision=config.get("model_sha", ""),
126
  still_on_hub=still_on_hub,
127
  architecture=architecture,
 
128
  inference_framework=inference_framework,
129
  )
130
 
 
159
 
160
  # breakpoint()
161
  # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
162
+
163
  data_dict = {
164
  "eval_name": self.eval_name, # not a column, just a save name,
165
  AutoEvalColumn.precision.name: self.precision.value.name,
166
+ AutoEvalColumn.model_type.name: self.model_type.value.name,
167
  AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
168
+ AutoEvalColumn.weight_type.name: self.weight_type.value.name,
169
+ AutoEvalColumn.architecture.name: self.architecture,
170
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
171
  AutoEvalColumn.dummy.name: self.full_model,
172
+ AutoEvalColumn.revision.name: self.revision,
173
+ # AutoEvalColumn.average.name: average,
174
+ AutoEvalColumn.license.name: self.license,
175
+ AutoEvalColumn.likes.name: self.likes,
176
+ AutoEvalColumn.params.name: self.num_params,
177
+ AutoEvalColumn.still_on_hub.name: self.still_on_hub,
178
  AutoEvalColumn.inference_framework.name: self.inference_framework,
179
  }
180
 
 
262
 
263
  eval_results = {}
264
  for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
265
+ # Creation of result
266
+ eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
267
+ eval_result.update_with_request_file(requests_path)
268
+ # Store results of same eval together
269
+ eval_name = eval_result.eval_name
270
+ if eval_name in eval_results.keys():
271
+ eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
272
+ else:
273
+ eval_results[eval_name] = eval_result
 
 
 
 
 
 
 
274
 
275
  results = []
276
  for v in eval_results.values():
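In the hunk above, each raw metric is scaled by a `multiplier` before being stored in `results[benchmark][metric]`: accuracy-style scores are turned into percentages while time and throughput values keep their native units. A hedged, standalone rendering of that normalization step (the exact set of conditions in the file may differ):

```python
def normalize_metric(benchmark: str, metric: str, value: float) -> float:
    """Scale a raw metric for display: accuracies become percentages,
    timing and throughput metrics stay in seconds / tokens-per-second."""
    multiplier = 100.0
    if "rouge" in metric and "truthful" not in benchmark:
        multiplier = 1.0
    if "time" in metric or "throughput" in metric:
        multiplier = 1.0
    return value * multiplier

print(normalize_metric("mmlu", "acc", 0.674))                         # 67.4
print(normalize_metric("selfcheckgpt", "end_to_end_time", 4.2))       # 4.2
print(normalize_metric("selfcheckgpt", "decoding_throughput", 35.0))  # 35.0
```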
src/populate.py CHANGED
@@ -12,7 +12,7 @@ from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, update_
12
 
13
  from src.backend.envs import Tasks as BackendTasks
14
  from src.display.utils import Tasks
15
- from src.display.utils import system_metrics_to_name_map, gpu_metrics_to_name_map
16
 
17
  def get_leaderboard_df(
18
  results_path: str,
@@ -45,7 +45,12 @@ def get_leaderboard_df(
45
  bm = (task.benchmark, task.metric)
46
  name_to_bm_map[name] = bm
47
 
48
-
 
 
 
 
 
49
 
50
  all_data_json = []
51
  for entry in all_data_json_:
@@ -58,9 +63,6 @@ def get_leaderboard_df(
58
  if sys_metric in entry[k]:
59
  new_entry[f"{k} {metric_namne}"] = entry[k][sys_metric]
60
 
61
- for gpu_metric, metric_namne in gpu_metrics_to_name_map.items():
62
- if gpu_metric in entry[k]:
63
- new_entry[f"{k} {metric_namne}"] = entry[k][gpu_metric]
64
  all_data_json += [new_entry]
65
 
66
  # all_data_json.append(baseline_row)
 
12
 
13
  from src.backend.envs import Tasks as BackendTasks
14
  from src.display.utils import Tasks
15
+ from src.display.utils import E2Es, PREs, TS
16
 
17
  def get_leaderboard_df(
18
  results_path: str,
 
45
  bm = (task.benchmark, task.metric)
46
  name_to_bm_map[name] = bm
47
 
48
+ # bm_to_name_map = {bm: name for name, bm in name_to_bm_map.items()}
49
+ system_metrics_to_name_map = {
50
+ "end_to_end_time": f"{E2Es}",
51
+ "prefilling_time": f"{PREs}",
52
+ "decoding_throughput": f"{TS}",
53
+ }
54
 
55
  all_data_json = []
56
  for entry in all_data_json_:
 
63
  if sys_metric in entry[k]:
64
  new_entry[f"{k} {metric_namne}"] = entry[k][sys_metric]
65
 
 
 
 
66
  all_data_json += [new_entry]
67
 
68
  # all_data_json.append(baseline_row)
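With the GPU columns removed, `get_leaderboard_df` only flattens the three system metrics into per-task display columns named `"{task} {label}"`. A small sketch of that flattening with a made-up entry (the entry layout is an assumption; the label strings follow the E2E(s)/PRE(s)/T/s names used elsewhere in this PR):

```python
# Display labels as used in src/display/about.py and src/display/utils.py
E2Es, PREs, TS = "E2E(s)", "PRE(s)", "T/s"

system_metrics_to_name_map = {
    "end_to_end_time": E2Es,
    "prefilling_time": PREs,
    "decoding_throughput": TS,
}

# Hypothetical raw row: per-task dicts of metric -> value
entry = {
    "model_name_for_query": "some-org/some-moe-model",
    "SelfCheckGPT": {"max-selfcheckgpt": 0.71, "end_to_end_time": 4.2,
                     "prefilling_time": 0.3, "decoding_throughput": 35.0},
}

new_entry = {k: v for k, v in entry.items() if not isinstance(v, dict)}
for task_col, metrics in entry.items():
    if not isinstance(metrics, dict):
        continue
    for sys_metric, label in system_metrics_to_name_map.items():
        if sys_metric in metrics:
            new_entry[f"{task_col} {label}"] = metrics[sys_metric]

print(new_entry)
# {'model_name_for_query': 'some-org/some-moe-model',
#  'SelfCheckGPT E2E(s)': 4.2, 'SelfCheckGPT PRE(s)': 0.3, 'SelfCheckGPT T/s': 35.0}
```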
src/submission/check_validity.py CHANGED
@@ -74,7 +74,7 @@ def is_model_on_hub(
74
 
75
 
76
  def get_model_size(model_info: ModelInfo, precision: str):
77
- size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
78
  try:
79
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
80
  except (AttributeError, TypeError):
@@ -130,8 +130,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
130
  continue
131
  with open(os.path.join(root, file), "r") as f:
132
  info = json.load(f)
133
- if not info["status"] == "FINISHED" and not info["status"] == "RUNNING":
134
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
135
 
136
  # Select organisation
137
  if info["model"].count("/") == 0 or "submitted_time" not in info:
 
74
 
75
 
76
  def get_model_size(model_info: ModelInfo, precision: str):
77
+ size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
78
  try:
79
  model_size = round(model_info.safetensors["total"] / 1e9, 3)
80
  except (AttributeError, TypeError):
 
130
  continue
131
  with open(os.path.join(root, file), "r") as f:
132
  info = json.load(f)
133
+ file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}")
 
134
 
135
  # Select organisation
136
  if info["model"].count("/") == 0 or "submitted_time" not in info:
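The `size_pattern` regex above (`(\d\.)?\d+(b|m)`) is the usual trick for pulling a parameter count such as `7b` or `350m` out of a model id when safetensors metadata is unavailable. A quick, hedged illustration of how such a pattern can be applied (the fallback helper is an assumption, not this file's exact logic):

```python
import re

size_pattern = re.compile(r"(\d\.)?\d+(b|m)")

def guess_size_from_name(model_id: str) -> float | None:
    """Return an approximate parameter count in billions parsed from the
    model id, e.g. 'mistralai/Mistral-7B-v0.1' -> 7.0, 'opt-350m' -> 0.35."""
    match = size_pattern.search(model_id.lower())
    if match is None:
        return None
    text = match.group(0)
    value = float(text[:-1])
    return value if text.endswith("b") else value / 1000

print(guess_size_from_name("mistralai/Mistral-7B-v0.1"))  # 7.0
print(guess_size_from_name("facebook/opt-350m"))          # 0.35
```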
src/submission/submit.py CHANGED
@@ -26,8 +26,7 @@ def add_new_eval(
26
  weight_type: str,
27
  model_type: str,
28
  inference_framework: str,
29
- debug: bool = False,
30
- gpu_type: str = "NVIDIA-A100-PCIe-80GB",
31
  ):
32
  global REQUESTED_MODELS
33
  global USERS_TO_SUBMISSION_DATES
@@ -115,18 +114,17 @@ def add_new_eval(
115
  "params": model_size,
116
  "license": license,
117
  "inference_framework": inference_framework,
118
- "gpu_type": gpu_type
119
  }
120
 
121
  # Check for duplicate submission
122
- if f"{model}_{revision}_{precision}_{inference_framework}_{gpu_type}" in REQUESTED_MODELS:
123
  return styled_warning("This model has been already submitted.")
124
 
125
  print("Creating eval file")
126
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
127
  os.makedirs(OUT_DIR, exist_ok=True)
128
  # out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
129
- out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}_{gpu_type}.json"
130
 
131
  with open(out_path, "w") as f:
132
  f.write(json.dumps(eval_entry))
 
26
  weight_type: str,
27
  model_type: str,
28
  inference_framework: str,
29
+ debug: bool = False
 
30
  ):
31
  global REQUESTED_MODELS
32
  global USERS_TO_SUBMISSION_DATES
 
114
  "params": model_size,
115
  "license": license,
116
  "inference_framework": inference_framework,
 
117
  }
118
 
119
  # Check for duplicate submission
120
+ if f"{model}_{revision}_{precision}_{inference_framework}" in REQUESTED_MODELS:
121
  return styled_warning("This model has been already submitted.")
122
 
123
  print("Creating eval file")
124
  OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
125
  os.makedirs(OUT_DIR, exist_ok=True)
126
  # out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
127
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}.json"
128
 
129
  with open(out_path, "w") as f:
130
  f.write(json.dumps(eval_entry))
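After this change, both the duplicate-submission guard and the request file name are keyed on model, revision, precision, and inference framework (the GPU type is no longer part of the key). A compact sketch of that flow with placeholder values (paths and field values here are hypothetical):

```python
import json
import os

# Hypothetical submission values
model, revision, precision = "some-org/some-moe-model", "main", "bfloat16"
weight_type, inference_framework, private = "Original", "hf-chat", False

REQUESTED_MODELS = set()  # normally built by already_submitted_models()
submission_key = f"{model}_{revision}_{precision}_{inference_framework}"

if submission_key in REQUESTED_MODELS:
    print("This model has been already submitted.")
else:
    out_dir = "eval-queue/some-org"  # placeholder for EVAL_REQUESTS_PATH/user
    os.makedirs(out_dir, exist_ok=True)
    model_path = model.split("/")[-1]
    out_path = (f"{out_dir}/{model_path}_eval_request_{private}_{precision}"
                f"_{weight_type}_{inference_framework}.json")
    with open(out_path, "w") as f:
        json.dump({"model": model, "revision": revision, "precision": precision,
                   "inference_framework": inference_framework, "status": "PENDING"}, f)
    REQUESTED_MODELS.add(submission_key)
```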
src/utils.py CHANGED
@@ -1,56 +1,6 @@
1
  import pandas as pd
2
  from huggingface_hub import snapshot_download
3
- import subprocess
4
- import re
5
- import os
6
- import GPUtil
7
 
8
- try:
9
- from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
10
- except:
11
- print("local debug: from display.utils")
12
- from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
13
-
14
- MEM_BW_DICT ={
15
- "NVIDIA-A100-PCIe-80GB": 1935,
16
- "NVIDIA-A100-SXM-80GB": 2039,
17
- "NVIDIA-H100-PCIe-80GB": 2039,
18
- "NVIDIA-RTX-A5000-24GB": 768
19
- }
20
-
21
- PEAK_FLOPS_DICT = {
22
- "float32":{
23
- "NVIDIA-A100-PCIe-80GB": 312e12,
24
- "NVIDIA-A100-SXM-80GB": 312e12,
25
- "NVIDIA-H100-PCIe-80GB": 756e12,
26
- "NVIDIA-RTX-A5000-24GB": 222.2e12
27
- },
28
- "float16":{
29
- "NVIDIA-A100-PCIe-80GB": 624e12,
30
- "NVIDIA-A100-SXM-80GB": 624e12,
31
- "NVIDIA-H100-PCIe-80GB": 1513e12,
32
- "NVIDIA-RTX-A5000-24GB": 444.4e12
33
- },
34
- "bfloat16":{
35
- "NVIDIA-A100-PCIe-80GB": 624e12,
36
- "NVIDIA-A100-SXM-80GB": 624e12,
37
- "NVIDIA-H100-PCIe-80GB": 1513e12,
38
- "NVIDIA-RTX-A5000-24GB": 444.4e12
39
- },
40
- "8bit":{
41
- "NVIDIA-A100-PCIe-80GB": 1248e12,
42
- "NVIDIA-A100-SXM-80GB": 1248e12,
43
- "NVIDIA-H100-PCIe-80GB": 3026e12,
44
- "NVIDIA-RTX-A5000-24GB": 889e12
45
- },
46
- "4bit": {
47
- "NVIDIA-A100-PCIe-80GB": 2496e12,
48
- "NVIDIA-A100-SXM-80GB": 2496e12,
49
- "NVIDIA-H100-PCIe-80GB": 6052e12,
50
- "NVIDIA-RTX-A5000-24GB": 1778e12
51
- }
52
-
53
- }
54
 
55
  def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
56
  for i in range(10):
@@ -82,130 +32,3 @@ def get_dataset_summary_table(file_path):
82
  df = df[["Category", "Benchmark", "Data Split", "Data Size", "Language"]]
83
 
84
  return df
85
-
86
- def parse_nvidia_smi():
87
- visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
88
- if visible_devices is not None:
89
- gpu_indices = visible_devices.split(',')
90
- else:
91
- # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
92
- result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
93
- if result.returncode != 0:
94
- print("Failed to query GPU indices.")
95
- return []
96
- gpu_indices = result.stdout.strip().split('\n')
97
- # print(f"gpu_indices: {gpu_indices}")
98
- gpu_stats = []
99
-
100
- gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
101
- # gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
102
- gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
103
-
104
- gpu_name = ""
105
- for index in gpu_indices:
106
- result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
107
- output = result.stdout.strip()
108
- lines = output.split("\n")
109
- for line in lines:
110
- match = gpu_info_pattern.search(line)
111
- name_match = gpu_name_pattern.search(line)
112
- gpu_info = {}
113
- if name_match:
114
- gpu_name = ''.join(filter(None, name_match.groups())).strip()
115
- if match:
116
- temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
117
- gpu_info.update({
118
- GPU_TEMP: temp,
119
- GPU_Power: power_usage,
120
- GPU_Mem: round(mem_usage / 1024, 2),
121
- GPU_Util: gpu_util
122
- })
123
-
124
- if len(gpu_info) >= 4:
125
- gpu_stats.append(gpu_info)
126
- # print(f"gpu_stats: {gpu_stats}")
127
- gpu_name = f"{len(gpu_stats)}x{gpu_name}"
128
- gpu_stats_total = {
129
- GPU_TEMP: 0,
130
- GPU_Power: 0,
131
- GPU_Mem: 0,
132
- GPU_Util: 0,
133
- GPU_Name: gpu_name
134
- }
135
- for gpu_stat in gpu_stats:
136
- gpu_stats_total[GPU_TEMP] += gpu_stat[GPU_TEMP]
137
- gpu_stats_total[GPU_Power] += gpu_stat[GPU_Power]
138
- gpu_stats_total[GPU_Mem] += gpu_stat[GPU_Mem]
139
- gpu_stats_total[GPU_Util] += gpu_stat[GPU_Util]
140
- gpu_stats_total[GPU_Mem] = gpu_stats_total[GPU_Mem] # G
141
- gpu_stats_total[GPU_TEMP] /= len(gpu_stats)
142
- gpu_stats_total[GPU_Power] /= len(gpu_stats)
143
- gpu_stats_total[GPU_Util] /= len(gpu_stats)
144
- return [gpu_stats_total]
145
-
146
- def monitor_gpus(stop_event, interval, stats_list):
147
- while not stop_event.is_set():
148
- gpu_stats = parse_nvidia_smi()
149
- if gpu_stats:
150
- stats_list.extend(gpu_stats)
151
- stop_event.wait(interval)
152
-
153
- def analyze_gpu_stats(stats_list):
154
- # Check if the stats_list is empty, and return None if it is
155
- if not stats_list:
156
- return None
157
-
158
- # Initialize dictionaries to store the stats
159
- avg_stats = {}
160
- max_stats = {}
161
-
162
- # Calculate average stats, excluding 'GPU_Mem'
163
- for key in stats_list[0].keys():
164
- if key != GPU_Mem and key != GPU_Name:
165
- total = sum(d[key] for d in stats_list)
166
- avg_stats[key] = total / len(stats_list)
167
-
168
- # Calculate max stats for 'GPU_Mem'
169
- max_stats[GPU_Mem] = max(d[GPU_Mem] for d in stats_list)
170
- if GPU_Name in stats_list[0]:
171
- avg_stats[GPU_Name] = stats_list[0][GPU_Name]
172
- # Update average stats with max GPU memory usage
173
- avg_stats.update(max_stats)
174
-
175
- return avg_stats
176
-
177
- def get_gpu_details():
178
- gpus = GPUtil.getGPUs()
179
- gpu = gpus[0]
180
- name = gpu.name.replace(" ", "-")
181
- memory_gb = round(gpu.memoryTotal / 1024)
182
- memory = f"{memory_gb}GB"
183
-
184
- for part in name.split('-'):
185
- if part.endswith("GB") and part[:-2].isdigit():
186
- name = name.replace(f"-{part}", "").replace(part, "")
187
-
188
- formatted_name = f"{name}-{memory}"
189
-
190
- return formatted_name
191
-
192
- def get_peak_bw(gpu_name):
193
- return MEM_BW_DICT[gpu_name]
194
-
195
- def get_peak_flops(gpu_name, precision):
196
- return PEAK_FLOPS_DICT[precision][gpu_name]
197
-
198
- def transfer_precision2bytes(precision):
199
- if precision == "float32":
200
- return 4
201
- elif precision in ["float16", "bfloat16"]:
202
- return 2
203
- elif precision == "8bit":
204
- return 1
205
- elif precision == "4bit":
206
- return 0.5
207
- else:
208
- raise ValueError(f"Unsupported precision: {precision}")
209
-
210
- if __name__ == "__main__":
211
- print(analyze_gpu_stats(parse_nvidia_smi()))
 
1
  import pandas as pd
2
  from huggingface_hub import snapshot_download
 
 
 
 
3
4
 
5
  def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
6
  for i in range(10):
 
32
  df = df[["Category", "Benchmark", "Data Split", "Data Size", "Language"]]
33
 
34
  return df
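The deleted block in `src/utils.py` held the peak-bandwidth and peak-FLOPS tables, the precision-to-bytes helper, and the GPU polling code that backed the dropped S-MBU(%) and S-MFU(%) columns. As a rough, hedged reconstruction of how those pieces fit together (the formulas below are a simplified illustration, not the leaderboard's exact computation):

```python
# Subset of the removed peak-spec tables, converted to SI units
MEM_BW_DICT = {"NVIDIA-A100-PCIe-80GB": 1935e9}                     # bytes/s
PEAK_FLOPS_DICT = {"bfloat16": {"NVIDIA-A100-PCIe-80GB": 624e12}}   # FLOP/s

def transfer_precision2bytes(precision: str) -> float:
    """Bytes per weight for each supported precision (as in the removed helper)."""
    return {"float32": 4, "float16": 2, "bfloat16": 2, "8bit": 1, "4bit": 0.5}[precision]

def rough_sparse_mbu(activated_params: float, precision: str, gpu: str,
                     tokens_per_second: float) -> float:
    """Rough S-MBU: bytes of activated weights streamed per generated token,
    times decoding throughput, divided by peak memory bandwidth."""
    bytes_per_token = activated_params * transfer_precision2bytes(precision)
    return bytes_per_token * tokens_per_second / MEM_BW_DICT[gpu]

def rough_sparse_mfu(activated_params: float, precision: str, gpu: str,
                     tokens_per_second: float) -> float:
    """Rough S-MFU: ~2 FLOPs per activated parameter per token, divided by peak FLOPS."""
    flops_per_token = 2 * activated_params
    return flops_per_token * tokens_per_second / PEAK_FLOPS_DICT[precision][gpu]

# Example: ~13e9 activated parameters decoding at 35 tok/s in bfloat16 on one A100
args = (13e9, "bfloat16", "NVIDIA-A100-PCIe-80GB", 35.0)
print(f"S-MBU ~ {rough_sparse_mbu(*args):.1%}")   # ~47.0%
print(f"S-MFU ~ {rough_sparse_mfu(*args):.2%}")   # ~0.15%
```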