support dynamic refresh page #15
opened by AppleSwing
- LICENSE +0 -201
- README.md +1 -1
- app.py +261 -296
- backend-cli.py +30 -121
- requirements.txt +3 -7
- src/backend/envs.py +1 -4
- src/backend/hflm_with_measurement.py +18 -167
- src/backend/manage_requests.py +5 -5
- src/backend/moe_infinity.py +4 -9
- src/backend/run_eval_suite.py +1 -9
- src/backend/tasks/arena_hard/__init__.py +0 -0
- src/backend/tasks/arena_hard/arena_hard.yaml +0 -2
- src/backend/tasks/arena_hard/arena_judgment.py +0 -256
- src/backend/tasks/arena_hard/arena_utils.py +0 -349
- src/backend/tasks/arena_hard/configs/api_config.yaml +0 -17
- src/backend/tasks/arena_hard/configs/judge_config.yaml +0 -26
- src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl +0 -0
- src/backend/tasks/arena_hard/question.jsonl +0 -0
- src/backend/tasks/arena_hard/task.py +0 -220
- src/backend/tasks/gsm8k/gsm8k-custom.yaml +0 -47
- src/backend/tasks/measurement_task_utils.py +0 -9
- src/backend/tasks/selfcheckgpt/task.py +2 -2
- src/display/about.py +2 -15
- src/display/imgs/Netmind.AI_LOGO.jpg +0 -0
- src/display/utils.py +37 -99
- src/leaderboard/read_evals.py +22 -45
- src/populate.py +7 -5
- src/submission/check_validity.py +2 -3
- src/submission/submit.py +3 -5
- src/utils.py +0 -177
LICENSE
DELETED
@@ -1,201 +0,0 @@
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
(the remaining deleted lines are the unmodified standard text of the Apache License, Version 2.0: the definitions, copyright and patent grants, redistribution conditions, trademark, warranty, and liability clauses, and the appendix on applying the license to a work)
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🔥
 colorFrom: green
 colorTo: indigo
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.9.0
 app_file: app.py
 pinned: true
 license: apache-2.0
app.py
CHANGED
@@ -1,8 +1,8 @@
 #!/usr/bin/env python
+
 import os
 import datetime
 import socket
-import base64
 from threading import Thread

 import gradio as gr
@@ -11,7 +11,6 @@
 from apscheduler.schedulers.background import BackgroundScheduler

 from huggingface_hub import snapshot_download
-from pytz import utc

 from src.display.about import (
     CITATION_BUTTON_LABEL,
@@ -22,7 +21,6 @@ from src.display.about import (
     LLM_BENCHMARKS_DETAILS,
     FAQ_TEXT,
     TITLE,
-    ACKNOWLEDGEMENT_TEXT,
 )

 from src.display.css_html_js import custom_css
@@ -39,7 +37,6 @@ from src.display.utils import (
     fields,
     WeightType,
     Precision,
-    GPUType
 )

 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, \
@@ -76,7 +73,7 @@ def restart_space():


 def init_space():
-
+    dataset_df = get_dataset_summary_table(file_path="blog/Hallucination-Leaderboard-Summary.csv")

     if socket.gethostname() not in {"neuromancer"}:
         # sync model_type with open-llm-leaderboard
@@ -91,19 +88,7 @@
     finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
         EVAL_REQUESTS_PATH, EVAL_COLS
     )
-
-    return None, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
-
-
-def add_benchmark_columns(shown_columns):
-    benchmark_columns = []
-    for benchmark in BENCHMARK_COLS:
-        if benchmark in shown_columns:
-            for c in COLS:
-                if benchmark in c and benchmark != c:
-                    benchmark_columns.append(c)
-    return benchmark_columns
-
+    return dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df

 # Searching and filtering
 def update_table(
@@ -111,8 +96,7 @@ def update_table(
 ):
     filtered_df = filter_models(hidden_df, type_query, size_query, precision_query)
     filtered_df = filter_queries(query, filtered_df)
-
-    df = select_columns(filtered_df, columns + benchmark_columns)
+    df = select_columns(filtered_df, columns)
     return df


@@ -160,7 +144,6 @@ def filter_models(df: pd.DataFrame, type_query: list, size_query: list, precisio
     type_emoji = [t[0] for t in type_query]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-    filtered_df = filtered_df.loc[df[AutoEvalColumn.inference_framework.name].isin(size_query)]

     # numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
     # params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
@@ -173,176 +156,154 @@ shown_columns = None
 dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
 leaderboard_df = original_df.copy()

This hunk and the next one (@@ -353,133 +314,137 @@ with demo:) drop the old module-level `demo = gr.Blocks(...)` UI. Among the deleted lines are the column selector labeled "Tasks", the `+ benchmark_columns` / `+ [AutoEvalColumn.dummy.name]` terms in the leaderboard table value and headers, the evaluation-queue accordions built with `row_count=5`, and the `gpu_type = gr.Dropdown(choices=[t.to_str() for t in GPUType], value="NVIDIA-A100-PCIe-80GB", ...)` control in the submission form. In their place the new version defines per-table accessor functions and builds the whole UI inside `block_launch()`:

+def update_leaderboard_table():
+    global leaderboard_df, shown_columns
+    print("Updating leaderboard table")
+    return leaderboard_df[
+        [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+        + shown_columns.value
+        + [AutoEvalColumn.dummy.name]
+    ] if not leaderboard_df.empty else leaderboard_df
+
+def update_hidden_leaderboard_table():
+    global original_df
+    return original_df[COLS] if original_df.empty is False else original_df
+
+def update_dataset_table():
+    global dataset_df
+    return dataset_df
+
+def update_finish_table():
+    global finished_eval_queue_df
+    return finished_eval_queue_df
+
+def update_running_table():
+    global running_eval_queue_df
+    return running_eval_queue_df
+
+def update_pending_table():
+    global pending_eval_queue_df
+    return pending_eval_queue_df
+
+def update_finish_num():
+    global finished_eval_queue_df
+    return len(finished_eval_queue_df)
+
+def update_running_num():
+    global running_eval_queue_df
+    return len(running_eval_queue_df)
+
+def update_pending_num():
+    global pending_eval_queue_df
+    return len(pending_eval_queue_df)

 # triggered only once at startup => read query parameter if it exists
 def load_query(request: gr.Request):
     query = request.query_params.get("query") or ""
     return query
+
+def refresh_leaderboard():
+    return gr.update(value=update_leaderboard_table()), gr.update(value=update_hidden_leaderboard_table()), \
+        gr.update(value=update_dataset_table()), gr.update(value=update_finish_table()), \
+        gr.update(value=update_running_table()), gr.update(value=update_pending_table()), \
+        gr.update(value=update_finish_num()), gr.update(value=update_running_num()), gr.update(value=update_pending_num())
+
+def periodic_init():
+    global dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, leaderboard_df
+    while True:
+        time.sleep(60)
+        dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = init_space()
+        leaderboard_df = original_df.copy()
+
+def block_launch():
+    global dataset_df, original_df, finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df, leaderboard_df, shown_columns
+    demo = gr.Blocks(css=custom_css)
+    with demo:
+        gr.HTML(TITLE)
+        gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+        with gr.Tabs(elem_classes="tab-buttons") as tabs:
+            with gr.TabItem("MOE-LLM-GPU-Poor-Leaderboard Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+                with gr.Row():
+                    with gr.Column():
+                        with gr.Row():
+                            search_bar = gr.Textbox(
+                                placeholder=" 🔍 Model search (separate multiple queries with `;`)",
+                                show_label=False,
+                                elem_id="search-bar",
+                            )
+                        with gr.Row():
+                            shown_columns = gr.CheckboxGroup(
+                                choices=[
+                                    c.name
+                                    for c in fields(AutoEvalColumn)
+                                    if not c.hidden and not c.never_hidden and not c.dummy
+                                ],
+                                value=[
+                                    c.name
+                                    for c in fields(AutoEvalColumn)
+                                    if c.displayed_by_default and not c.hidden and not c.never_hidden
+                                ],
+                                label="Select columns to show",
+                                elem_id="column-select",
+                                interactive=True,
+                            )
+                    with gr.Column(min_width=320):
+                        filter_columns_size = gr.CheckboxGroup(
+                            label="Inference frameworks",
+                            choices=[t.to_str() for t in InferenceFramework],
+                            value=[t.to_str() for t in InferenceFramework],
+                            interactive=True,
+                            elem_id="filter-columns-size",
+                        )
+                        filter_columns_type = gr.CheckboxGroup(
+                            label="Model types",
+                            choices=[t.to_str() for t in ModelType],
+                            value=[t.to_str() for t in ModelType],
+                            interactive=True,
+                            elem_id="filter-columns-type",
+                        )
+                        filter_columns_precision = gr.CheckboxGroup(
+                            label="Precision",
+                            choices=[i.value.name for i in Precision],
+                            value=[i.value.name for i in Precision],
+                            interactive=True,
+                            elem_id="filter-columns-precision",
+                        )
+                        # filter_columns_size = gr.CheckboxGroup(
+                        #     label="Model sizes (in billions of parameters)",
+                        #     choices=list(NUMERIC_INTERVALS.keys()),
+                        #     value=list(NUMERIC_INTERVALS.keys()),
+                        #     interactive=True,
+                        #     elem_id="filter-columns-size",
+                        # )
+                        # breakpoint()
+                        refresh_button = gr.Button("Refresh", visible=True)
+                leaderboard_table = gr.components.Dataframe(
+                    value=(
+                        leaderboard_df[
+                            [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
+                            + shown_columns.value
+                            + [AutoEvalColumn.dummy.name]
+                        ]
+                        if leaderboard_df.empty is False
+                        else leaderboard_df
+                    ),
+                    headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
+                    datatype=TYPES,
+                    elem_id="leaderboard-table",
+                    interactive=False,
+                    visible=True,
+                )  # column_widths=["2%", "20%"]
+                # Dummy leaderboard for handling the case when the user uses backspace key
+                hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                    value=original_df[COLS] if original_df.empty is False else original_df,
+                    headers=COLS,
+                    datatype=TYPES,
+                    visible=False,
+                )
+                # refresh_button.click(fn=update_leaderboard_tables, outputs=[leaderboard_table, hidden_leaderboard_table_for_search])
+                search_bar.submit(
+                    update_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        filter_columns_type,
+                        filter_columns_precision,
+                        filter_columns_size,
+                        search_bar,
+                    ],
+                    leaderboard_table,
+                )
+                # Check query parameter once at startup and update search bar
+                demo.load(load_query, inputs=[], outputs=[search_bar])
+                for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size]:
+                    selector.change(
+                        update_table,
+                        [
+                            hidden_leaderboard_table_for_search,
+                            shown_columns,
+                            filter_columns_type,
+                            filter_columns_precision,
+                            filter_columns_size,
+                            search_bar,
+                        ],
+                        leaderboard_table,
+                        queue=True,
+                    )
+            with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
+                gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+                dataset_table = gr.components.Dataframe(
+                    value=dataset_df,
+                    headers=list(dataset_df.columns),
+                    datatype=["str", "markdown", "str", "str", "str"],
+                    elem_id="dataset-table",
+                    interactive=False,
+                    visible=True,
+                    column_widths=["15%", "20%"],
+                )
+                gr.Markdown(LLM_BENCHMARKS_DETAILS, elem_classes="markdown-text")
+                gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
+                # refresh_button.click(fn=update_dataset_table, outputs=[dataset_table])
+            with gr.TabItem("Submit a model ", elem_id="llm-benchmark-tab-table", id=3):
+                with gr.Column():
+                    with gr.Row():
+                        gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+                    with gr.Column():
+                        with gr.Accordion(f"✅ Finished Evaluations", open=False):
+                            with gr.Column():
+                                num_fin = gr.Number(len(finished_eval_queue_df), label="Number of finished evaluations", visible=True, interactive=False)
+                                with gr.Row():
+                                    finished_eval_table = gr.components.Dataframe(
+                                        value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=6
+                                    )
+                        with gr.Accordion(f"🔄 Running Evaluation Queue", open=False):
+                            with gr.Column():
+                                num_run = gr.Number(len(running_eval_queue_df), label="Number of running evaluations", visible=True, interactive=False)
+                                with gr.Row():
+                                    running_eval_table = gr.components.Dataframe(
+                                        value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=6
+                                    )
+                        with gr.Accordion(f"⏳ Scheduled Evaluation Queue", open=False):
+                            with gr.Column():
+                                num_sche = gr.Number(len(pending_eval_queue_df), label="Number of scheduled evaluations", visible=True, interactive=False)
+                                with gr.Row():
+                                    pending_eval_table = gr.components.Dataframe(
+                                        value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=6
+                                    )
+                        # refresh_button.click(fn=update_submit_tables,
+                        #                      outputs=[finished_eval_table, running_eval_table, pending_eval_table])
+                    with gr.Row():
+                        gr.Markdown("# Submit your model here", elem_classes="markdown-text")
+                    with gr.Row():
+                        inference_framework = gr.Dropdown(
+                            choices=[t.to_str() for t in InferenceFramework],
+                            label="Inference framework",
+                            multiselect=False,
+                            value=None,
+                            interactive=True,
+                        )
+                    with gr.Row():
+                        with gr.Column():
+                            model_name_textbox = gr.Textbox(label="Model name")
+                            revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                            private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
+                            model_type = gr.Dropdown(
+                                choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                                label="Model type",
+                                multiselect=False,
+                                value=None,
+                                interactive=True,
+                            )
+                        with gr.Column():
+                            precision = gr.Dropdown(
+                                choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                                label="Precision",
+                                multiselect=False,
+                                value="float32",
+                                interactive=True,
+                            )
+                            weight_type = gr.Dropdown(
+                                choices=[i.value.name for i in WeightType],
+                                label="Weights type",
+                                multiselect=False,
+                                value="Original",
+                                interactive=True,
+                            )
+                            base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+                    submit_button = gr.Button("Submit Eval")
+                    submission_result = gr.Markdown()
+                    debug = gr.Checkbox(args.debug, label="Debug", visible=False)
+                    submit_button.click(
+                        add_new_eval,
+                        [
+                            model_name_textbox,
+                            base_model_name_textbox,
+                            revision_name_textbox,
+                            precision,
+                            private,
+                            weight_type,
+                            model_type,
+                            inference_framework,
+                            debug
+                        ],
+                        submission_result,
+                    )
+                    refresh_button.click(refresh_leaderboard,
+                                         outputs=[leaderboard_table, hidden_leaderboard_table_for_search, dataset_table,
+                                                  finished_eval_table, running_eval_table, pending_eval_table, num_fin, num_run, num_sche])
+
+        with gr.Row():
+            with gr.Accordion("Citing this leaderboard", open=False):
+                citation_button = gr.Textbox(
+                    value=CITATION_BUTTON_TEXT,
+                    label=CITATION_BUTTON_LABEL,
+                    lines=20,
+                    elem_id="citation-button",
+                    show_copy_button=True,
+                )
+    demo.queue(default_concurrency_limit=40).launch()
+
+scheduler = BackgroundScheduler()

 scheduler.add_job(restart_space, "interval", hours=6)

@@ -490,9 +455,9 @@ def launch_backend():
     if DEVICE not in {"cpu"}:
         _ = subprocess.run(["python", "backend-cli.py"])

-
+Thread(target=periodic_init, daemon=True).start()
 # scheduler.add_job(launch_backend, "interval", seconds=120)
 if __name__ == "__main__":
     scheduler.start()
-
+    block_launch()
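The refresh path added here follows a standard Gradio pattern: the click handler returns one gr.update(...) per output component, and a daemon thread refreshes the module-level DataFrames in the background so the next click picks them up. A minimal, self-contained sketch of that pattern (the load_data() helper and component names below are illustrative, not taken from this repository):

import threading
import time

import pandas as pd
import gradio as gr

# Hypothetical stand-in for init_space(): the real app re-reads the evaluation
# queues and leaderboard results from disk / the Hub.
def load_data() -> pd.DataFrame:
    return pd.DataFrame({"model": ["model-a", "model-b"], "score": [round(time.time()) % 100, 42]})

current_df = load_data()

def periodic_reload(interval_s: int = 60):
    # Background refresh of the module-level state, mirroring periodic_init().
    global current_df
    while True:
        time.sleep(interval_s)
        current_df = load_data()

def refresh():
    # Return gr.update(...) objects so the already-rendered components are
    # updated in place, mirroring refresh_leaderboard().
    return gr.update(value=current_df), gr.update(value=len(current_df))

with gr.Blocks() as demo:
    table = gr.Dataframe(value=current_df)
    count = gr.Number(value=len(current_df), label="rows")
    gr.Button("Refresh").click(refresh, outputs=[table, count])

threading.Thread(target=periodic_reload, daemon=True).start()

if __name__ == "__main__":
    demo.queue().launch()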
backend-cli.py
CHANGED
@@ -6,7 +6,6 @@ import argparse

 import socket
 import random
-import threading
 from datetime import datetime

 from src.backend.run_eval_suite import run_evaluation
@@ -16,20 +15,18 @@ from src.backend.envs import Tasks, EVAL_REQUESTS_PATH_BACKEND, EVAL_RESULTS_PAT
 from src.backend.manage_requests import EvalRequest
 from src.leaderboard.read_evals import EvalResult

-from src.envs import QUEUE_REPO, RESULTS_REPO, API
-from src.utils import my_snapshot_download
+from src.envs import QUEUE_REPO, RESULTS_REPO, API
+from src.utils import my_snapshot_download

 from src.leaderboard.read_evals import get_raw_eval_results

 from typing import Optional
+
 import time

 import pprint
 import logging

-from lm_eval.filters.extraction import RegexFilter
-

 # Configure the root logger
 logging.basicConfig(
@@ -44,20 +41,6 @@ eval_logger = logging.getLogger("lm-eval")
 # Explicitly set the level for 'lm-eval' logger to WARNING
 eval_logger.setLevel(logging.WARNING)

-def tuple_input_decorator(func):
-    def wrapper(self, resps, docs):
-        stripped_resps = [[resp_data[0] for resp_data in group] for group in resps]
-
-        filtered_resps = func(self, stripped_resps, docs)
-
-        combined_resps = []
-        for original_group, new_group in zip(resps, filtered_resps):
-            combined_group = [(new_resp,) + rest_of_data[1:] for new_resp, rest_of_data in zip(new_group, original_group)]
-            combined_resps.append(combined_group)
-
-        return combined_resps
-    return wrapper
-

 def my_set_eval_request(api, eval_request, set_to_status, hf_repo, local_dir):
     for i in range(10):
@@ -140,23 +123,7 @@ def request_to_result_name(request: EvalRequest) -> str:


 def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[int] = None) -> dict:
-    batch_size =
-    batch_size = eval_request.batch_size
-
-    init_gpu_info = analyze_gpu_stats(parse_nvidia_smi())
-    # if init_gpu_info['Mem(M)'] > 500:
-    #     assert False, f"This machine is not empty: {init_gpu_info}"
-    gpu_stats_list = []
-    stop_event = threading.Event()
-    monitor_thread = threading.Thread(target=monitor_gpus, args=(stop_event, 5, gpu_stats_list))
-    monitor_thread.start()
-
-    original_apply = RegexFilter.apply
-    if task.benchmark in ["gsm8k", "gsm8k_cot", "gsm8k_cot_self_consistency", "gsm8k_custom"]:
-        RegexFilter.apply = tuple_input_decorator(RegexFilter.apply)
-    else:
-        RegexFilter.apply = original_apply
-
+    batch_size = 4
     try:
         results = run_evaluation(
             eval_request=eval_request,
@@ -183,20 +150,6 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
         raise

     # print("RESULTS", results)
-    stop_event.set()
-    monitor_thread.join()
-    gpu_info = analyze_gpu_stats(gpu_stats_list)
-    for task_name in results['results'].keys():
-        for key, value in gpu_info.items():
-            if "GPU" not in key:
-                results['results'][task_name][f"{key},none"] = int(value)
-            else:
-                results['results'][task_name][f"{key},none"] = value
-
-        results['results'][task_name]['batch_size,none'] = batch_size
-        results['results'][task_name]['precision,none'] = eval_request.precision
-    print(f"gpu_stats_list: {gpu_stats_list}")
-    print("GPU Usage:", gpu_info)

     dumped = json.dumps(results, indent=2, default=lambda o: "<not serializable>")
     # print(dumped)
@@ -217,8 +170,6 @@ def process_evaluation(task: Task, eval_request: EvalRequest, limit: Optional[in
         repo_id=RESULTS_REPO,
         repo_type="dataset",
     )
-
-    RegexFilter.apply = original_apply
     return results


@@ -387,9 +338,10 @@ def maybe_refresh_results(thr: int, hard_task_lst: Optional[list[str]] = None) -

     return False

+
 def process_pending_requests() -> bool:
     sanity_checks()

     current_pending_status = [PENDING_STATUS]

     # Get all eval request that are PENDING, if you want to run other evals, change this parameter
@@ -408,12 +360,6 @@ def process_pending_requests() -> bool:

     eval_request = eval_requests[0]
     pp.pprint(eval_request)
-
-    gpu_type = eval_request.gpu_type
-    curr_gpu_type = get_gpu_details()
-    if gpu_type != curr_gpu_type:
-        print(f"GPU type mismatch: {gpu_type} vs {curr_gpu_type}")
-        return False

     my_snapshot_download(
         repo_id=QUEUE_REPO, revision="main", local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset", max_workers=60
@@ -450,15 +396,11 @@ def get_args():
     parser = argparse.ArgumentParser(description="Run the backend")
     parser.add_argument("--debug", action="store_true", help="Run in debug mode")
     # debug parameters
-    parser.add_argument("--task", type=str, default="selfcheckgpt
-    parser.add_argument("--model", type=str, default="
-    parser.add_argument("--precision", type=str, default="
+    parser.add_argument("--task", type=str, default="selfcheckgpt", help="Task to debug")
+    parser.add_argument("--model", type=str, default="facebook/opt-1.3b", help="Model to debug")
+    parser.add_argument("--precision", type=str, default="float16", help="Precision to debug")
     parser.add_argument("--inference-framework", type=str, default="hf-chat", help="Inference framework to debug")
     parser.add_argument("--limit", type=int, default=None, help="Limit for the number of samples")
-    parser.add_argument("--gpu-type", type=str, default="NVIDIA-A100-PCIe-80GB",
-                        help="GPU type. NVIDIA-A100-PCIe-80GB; NVIDIA-RTX-A5000-24GB; NVIDIA-H100-PCIe-80GB")
-    parser.add_argument("--debug_repo", action="store_true", help="Use debug repo")
-    parser.add_argument("--model_type", type=str, default="chat", help="Model type")
     return parser.parse_args()


@@ -466,76 +408,43 @@ if __name__ == "__main__":
     args = get_args()
     local_debug = args.debug
     # debug specific task by ping
-    if local_debug
-        debug_model_names = args.model.split(",")
-        debug_task_name = args.task.split(",")
-        precisions = args.precision.split(",")
-        print(f"debug_model_names: {debug_model_names}, debug_task_name: {debug_task_name}, precisions: {precisions}")
-        for precision in precisions:
-                model_type=args.model_type,
-            )
-            curr_gpu_type = get_gpu_details()
-            if eval_request.gpu_type != curr_gpu_type:
-                print(f"GPU type mismatch: {eval_request.gpu_type} vs {curr_gpu_type}")
-                raise Exception("GPU type mismatch")
-            results = process_evaluation(task, eval_request, limit=args.limit)
-            # except Exception as e:
-            #     print(f"debug running error: {e}")
-    elif local_debug and args.debug_repo:
-        QUEUE_REPO = DEBUG_QUEUE_REPO
-        RESULTS_REPO = DEBUG_RESULTS_REPO
-    elif not local_debug and not args.debug_repo:
-        while True:
-            res = False
-            # if random.randint(0, 10) == 0:
-            res = process_pending_requests()
-            print(f"waiting for 60 seconds")
-            time.sleep(60)
-            # if res is False:
-            #     if random.randint(0, 5) == 0:
-            #         res = maybe_refresh_results(100)
-            #     else:
-            #         res = process_finished_requests(100)
-            # time.sleep(60)
-            # if res is False:
-            #     if random.randint(0, 5) == 0:
-            #         res = maybe_refresh_results(0)
-            #     else:
-            #         res = process_finished_requests(0)
-    else:
-        raise Exception("Cannot use debug_repo without local debug flag")
+    if local_debug:
+        debug_model_names = [args.model]  # Use model from arguments
+        debug_task_name = args.task  # Use task from arguments
         task_lst = TASKS_HARNESS.copy()
+        for task in task_lst:
             for debug_model_name in debug_model_names:
+                task_name = task.benchmark
+                if task_name != debug_task_name:
+                    continue
+                eval_request = EvalRequest(
+                    model=debug_model_name,
+                    private=False,
+                    status="",
+                    json_filepath="",
+                    precision=args.precision,  # Use precision from arguments
+                    inference_framework=args.inference_framework  # Use inference framework from arguments
+                )
+                results = process_evaluation(task, eval_request, limit=args.limit)
+    else:
         while True:
             res = False
+
             # if random.randint(0, 10) == 0:
             res = process_pending_requests()
             print(f"waiting for 60 seconds")
             time.sleep(60)
+
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(100)
             #     else:
             #         res = process_finished_requests(100)
+
             # time.sleep(60)
+
             # if res is False:
             #     if random.randint(0, 5) == 0:
             #         res = maybe_refresh_results(0)
             #     else:
             #         res = process_finished_requests(0)
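The GPU monitoring removed from process_evaluation() above used a stop-event-controlled sampling thread: a worker appends samples to a shared list until the caller sets a threading.Event. A generic, runnable sketch of that pattern, with a timestamp sampler standing in for the original parse_nvidia_smi()/monitor_gpus calls (which are not shown here):

import threading
import time

def monitor(stop_event: threading.Event, interval_s: float, samples: list):
    # Keep sampling until the caller signals the stop event.
    while not stop_event.is_set():
        samples.append(time.time())   # the real code appended GPU stats here
        stop_event.wait(interval_s)   # wakes up early if the event is set

samples: list = []
stop_event = threading.Event()
thread = threading.Thread(target=monitor, args=(stop_event, 0.5, samples))
thread.start()

time.sleep(2)          # ... run the evaluation ...

stop_event.set()       # tell the sampler to stop
thread.join()          # wait for it to finish before reading the samples
print(f"collected {len(samples)} samples")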
requirements.txt
CHANGED
@@ -4,7 +4,7 @@ APScheduler
 black
 click
 datasets
-gradio
+gradio
 gradio_client
 huggingface-hub
 matplotlib
@@ -16,7 +16,7 @@ requests
 semantic-version
 tqdm
 wandb
-transformers
+transformers>=4.36.0
 tokenizers>=0.15.0
 lm_eval[ifeval] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@v0.4.2
 accelerate
@@ -27,10 +27,6 @@ cchardet
 rouge_score
 bert-score
 evaluate
-spacy
+spacy
 selfcheckgpt
 immutabledict
-gputil
-bitsandbytes
-openai
-scikit-learn
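The new floor on transformers (>=4.36.0) is presumably there to guarantee support for the MoE architectures this leaderboard evaluates (Mixtral support landed around that release). A small, illustrative environment check, not part of the repository:

from packaging import version
import transformers

# Fail fast if the installed transformers predates the MoE model support the backend expects.
assert version.parse(transformers.__version__) >= version.parse("4.36.0"), \
    f"transformers {transformers.__version__} is too old; need >= 4.36.0"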
src/backend/envs.py
CHANGED
@@ -57,13 +57,10 @@ class Tasks(Enum):

     # task20 = Task("race", "acc", "RACE", 0)
     task21 = Task("mmlu", "acc", "MMLU", 5)
-    task22 = Task("gsm8k_custom", "em", "GSM8K", 5)
-    # task23 = Task("gsm8k_cot", "em", "GSM8K", 8)
-    task24 = Task("arena_hard", "score", "Arena Hard", 0)


 EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

-DEVICE = "cuda
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
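The new DEVICE line picks the backend device at import time; it assumes torch is imported elsewhere in envs.py, which this hunk does not show. A minimal sketch of the idiom:

import torch

# Fall back to CPU when no CUDA device is visible to the process.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"running evaluations on {DEVICE}")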
src/backend/hflm_with_measurement.py
CHANGED
@@ -24,7 +24,7 @@ from transformers.models.auto.modeling_auto import (
     MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
 )
 from transformers import TextStreamer
-
 from lm_eval import utils
 from lm_eval.api.instance import Instance
 from lm_eval.api.model import TemplateLM
@@ -37,9 +37,6 @@ from lm_eval.models.utils import (
     stop_sequences_criteria,
 )
 from lm_eval.models.huggingface import HFLM
-from src.utils import get_gpu_details, get_peak_bw, transfer_precision2bytes, get_peak_flops
-from src.submission.check_validity import get_model_size
-from src.envs import API


 class StopWatch(TextStreamer):
@@ -60,31 +57,16 @@ class StopWatch(TextStreamer):
         self.start_decoding = time()
         self.decoding_iterations += 1
         return
-
     def end(self):
         if self.decoding_time is None and self.start_decoding is not None:
             self.decoding_time = time() - self.start_decoding
         return
-

 class HFLMWithMeasurement(HFLM):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
-        self.pretrained = kwargs.get("pretrained", None)
-        self.revision = kwargs.get("revision", None)
-        self.precision = kwargs.get("dtype", None)
-        self.num_gpus = None
-
-    def _detect_num_gpus_used(self):
-        if self.num_gpus is not None:
-            return self.num_gpus
-        gpus = []
-        for p in self.model.parameters():
-            if p.device.type == "cuda":
-                gpus.append(p.device.index)
-
-        self.num_gpus = len(set(gpus))
-        return self.num_gpus

     def _loglikelihood_tokens(
         self,
@@ -297,7 +279,7 @@ class HFLMWithMeasurement(HFLM):
             # Answer: (log prob, is-exact-match)
             answer = (float(logits.sum()), bool(max_equal))

-            res.append((answer, per_sample_time, 0, 0

             self.cache_hook.add_partial("loglikelihood", request_str, answer)
             pbar.update(1)
@@ -305,16 +287,14 @@ class HFLMWithMeasurement(HFLM):
         pbar.close()

         return re_ord.get_original(res)
-
-    def _model_generate(self, context,
         # temperature = 0.0 if not set
         # if do_sample is false and temp==0.0:
         # remove temperature, as do_sample=False takes care of this
         # and we don't want a warning from HF
         generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
         do_sample = generation_kwargs.get("do_sample", None)
-
-        # is_gsm8k = generation_kwargs.get("is_gsm8k", False)

         # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
         if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
@@ -322,52 +302,7 @@ class HFLMWithMeasurement(HFLM):

         if do_sample is False and generation_kwargs.get("temperature") == 0.0:
             generation_kwargs.pop("temperature")
-
-        # if is_gsm8k:
-        #     generation_kwargs.pop("is_gsm8k")
-
-        context_length = context.shape[1]
-
-        if self.model.__class__.__name__ == "MoE":
-            model_config = self.model.model.config
-        else:
-            model_config = self.model.config
-
-        if not self.precision:
-            if model_config.quantization_config._load_in_4bit:
-                self.precision = "4bit"
-            elif model_config.quantization_config._load_in_8bit:
-                self.precision = "8bit"
-            else:
-                raise ValueError("Unknown precision")
-
-        # print(self.model)
-        linear_count = 0
-        element_wise_mul = 0
-        for name, module in self.model.named_modules():
-            if ('layers.0.' in name or "transformer.blocks.0" in name) and ('attn' not in name):
-                if 'experts.0.' in name or "ffn.experts" in name:
-                    if "linear_v" in name:
-                        element_wise_mul = 1
-                if isinstance(module, torch.nn.Linear):
-                    # print(name, module)
-                    linear_count += 1
-                elif isinstance(module, DbrxExpertGLU):
-                    linear_count = 3
-                    element_wise_mul = 1
-            # elif 'experts' not in name:
-            #     if ("gate" not in name and "router" not in name) or "gate_proj" in name:
-            #         if "gate_proj" in name:
-            #             element_wise_mul = 1
-            #         if isinstance(module, torch.nn.Linear):
-            #             # print(name, module)
-            #             linear_count += 1
-            else:
-                continue
-        print(f"linear_count: {linear_count}")
-        print(f"element_wise_mul: {element_wise_mul}")
-        print(f"GPU usage: {self._detect_num_gpus_used()}")
-
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
         )
@@ -375,7 +310,7 @@ class HFLMWithMeasurement(HFLM):
         start = time()
         res = self.model.generate(
             input_ids=context,
-
             stopping_criteria=stopping_criteria,
             pad_token_id=self.tokenizer.pad_token_id,
             use_cache=True,
@@ -383,86 +318,15 @@ class HFLMWithMeasurement(HFLM):
             **generation_kwargs,
         )
         end = time()
-
         batch_size = context.shape[0]
         output_length = stop_watch.decoding_iterations
-
-        precision_bytes = transfer_precision2bytes(self.precision)
-
-        model_size_param = sum(p.numel() for p in self.model.parameters())
-
-        n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else \
-            (model_config.num_layers if hasattr(model_config, "num_layers") else model_config.n_layers)
-
-        d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
-
-        if hasattr(model_config, "num_experts_per_tok"):
-            n_experts_per_tok = model_config.num_experts_per_tok
-        elif hasattr(model_config, "num_selected_experts"):
-            n_experts_per_tok = model_config.num_selected_experts
-        elif hasattr(model_config, "ffn_config"):
-            n_experts_per_tok = model_config.ffn_config.moe_top_k
-        else:
-            n_experts_per_tok = 1
if hasattr(model_config, "ffn_dim"):
|
409 |
-
d_ff = model_config.ffn_dim
|
410 |
-
elif hasattr(model_config, "intermediate_size"):
|
411 |
-
d_ff = model_config.intermediate_size
|
412 |
-
elif hasattr(model_config, "d_ff"):
|
413 |
-
d_ff = model_config.d_ff
|
414 |
-
elif hasattr(model_config, "ff_ratio"):
|
415 |
-
d_ff = d_model * model_config.ff_ratio
|
416 |
-
elif hasattr(model_config, "ffn_config"):
|
417 |
-
d_ff = model_config.ffn_config.ffn_hidden_size
|
418 |
-
else:
|
419 |
-
raise ValueError("Unknown FFN dimension")
|
420 |
-
|
421 |
-
if hasattr(model_config, "num_local_experts"):
|
422 |
-
num_experts = model_config.num_local_experts
|
423 |
-
elif hasattr(model_config, "num_experts"):
|
424 |
-
num_experts = model_config.num_experts
|
425 |
-
elif hasattr(model_config, "ffn_config"):
|
426 |
-
num_experts = model_config.ffn_config.moe_num_experts
|
427 |
-
else:
|
428 |
-
num_experts = 1
|
429 |
-
|
430 |
-
ffn_params = n_layers * d_ff * linear_count * d_model
|
431 |
-
|
432 |
-
shared_params = model_size_param - num_experts * ffn_params
|
433 |
-
|
434 |
-
model_size = shared_params + n_experts_per_tok * ffn_params
|
435 |
-
|
436 |
-
per_token_kv_size = 2 * n_layers * d_model * precision_bytes
|
437 |
-
|
438 |
-
peak_bw_single = get_peak_bw(get_gpu_details())
|
439 |
-
peak_bw = peak_bw_single * self._detect_num_gpus_used()
|
440 |
-
|
441 |
-
context_prefill_size = context_length
|
442 |
-
kv_size = context_prefill_size * per_token_kv_size + (output_length - 1) * per_token_kv_size / 2
|
443 |
-
|
444 |
-
kv_size = kv_size / 1e9
|
445 |
-
|
446 |
-
n_vocab = model_config.vocab_size
|
447 |
|
448 |
end_to_end_time = (end - start) / batch_size
|
449 |
prefilling_time = stop_watch.prefilling_time / batch_size
|
450 |
decoding_time = stop_watch.decoding_time / batch_size
|
451 |
token_per_sec = output_length / decoding_time
|
452 |
-
|
453 |
-
|
454 |
-
avg_context_length = context_length + (output_length - 1) / 2
|
455 |
-
flops_per_token = 2 * model_size + ((linear_count + element_wise_mul) * n_layers * avg_context_length * d_model) + 4 * d_model + 2 * d_model * n_vocab
|
456 |
-
peak_flops_single = get_peak_flops(get_gpu_details(), self.precision)
|
457 |
-
peak_flops = peak_flops_single * self._detect_num_gpus_used()
|
458 |
-
|
459 |
-
## TODO: only supports llama-type decoder-only models and MoE models such as Switch Transformer and Mixtral
|
460 |
-
mfu = token_per_sec * flops_per_token / peak_flops
|
461 |
-
mbu = achieve_mem_bw / peak_bw
|
462 |
-
|
463 |
-
print(f"mfu: {mfu}, mbu: {mbu}")
|
464 |
-
|
465 |
-
return res, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu
|
466 |
|
467 |
def generate_until(
|
468 |
self, requests: List[Instance], disable_tqdm: bool = False
|
@@ -539,19 +403,11 @@ class HFLMWithMeasurement(HFLM):
|
|
539 |
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
|
540 |
)
|
541 |
# add EOS token to stop sequences
|
542 |
-
eos =
|
543 |
if not until:
|
544 |
until = [eos]
|
545 |
else:
|
546 |
until.append(eos)
|
547 |
-
|
548 |
-
# is_gsm8k = kwargs.get("is_gsm8k", False)
|
549 |
-
# if is_gsm8k:
|
550 |
-
# until = ["Question:", "Question", "</s>"]
|
551 |
-
# eos_ids = [self.tokenizer.eos_token_id,
|
552 |
-
# self.tokenizer.convert_tokens_to_ids("<|eot_id|>")]
|
553 |
-
|
554 |
-
|
555 |
if "max_gen_toks" in kwargs.keys():
|
556 |
max_gen_toks = kwargs.pop("max_gen_toks")
|
557 |
else:
|
@@ -571,16 +427,14 @@ class HFLMWithMeasurement(HFLM):
|
|
571 |
left_truncate_len=max_ctx_len,
|
572 |
truncation=self.truncation,
|
573 |
)
|
574 |
-
|
575 |
-
# print("context: ", self.tok_decode(context_enc[0]))
|
576 |
context_enc = context_enc.to(self.device)
|
577 |
attn_masks = attn_masks.to(self.device)
|
578 |
|
579 |
-
if "
|
580 |
-
kwargs["
|
581 |
|
582 |
# perform batched generation
|
583 |
-
cont, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu = self._model_generate(
|
584 |
context=context_enc,
|
585 |
attention_mask=attn_masks,
|
586 |
stop=until,
|
@@ -591,21 +445,18 @@ class HFLMWithMeasurement(HFLM):
|
|
591 |
for cont_toks, context in zip(cont_toks_list, contexts):
|
592 |
# discard context + left-padding toks if using causal decoder-only LM
|
593 |
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
|
594 |
-
# print("After Generation: ", self.tok_decode(cont_toks))
|
595 |
cont_toks = cont_toks[context_enc.shape[1] :]
|
596 |
-
|
597 |
s = self.tok_decode(cont_toks)
|
598 |
|
599 |
-
#
|
600 |
-
# if not is_gsm8k:
|
601 |
for term in until:
|
602 |
if len(term) > 0:
|
603 |
# ignore '' separator,
|
604 |
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
|
605 |
s = s.split(term)[0]
|
606 |
-
|
607 |
-
|
608 |
-
res.append((s, end_to_end_time, prefilling_time, token_per_sec, mfu, mbu))
|
609 |
|
610 |
self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
|
611 |
pbar.update(1)
|
|
|
24 |
MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
|
25 |
)
|
26 |
from transformers import TextStreamer
|
27 |
+
|
28 |
from lm_eval import utils
|
29 |
from lm_eval.api.instance import Instance
|
30 |
from lm_eval.api.model import TemplateLM
|
|
|
37 |
stop_sequences_criteria,
|
38 |
)
|
39 |
from lm_eval.models.huggingface import HFLM
|
40 |
|
41 |
|
42 |
class StopWatch(TextStreamer):
|
|
|
57 |
self.start_decoding = time()
|
58 |
self.decoding_iterations += 1
|
59 |
return
|
60 |
+
|
61 |
def end(self):
|
62 |
if self.decoding_time is None and self.start_decoding is not None:
|
63 |
self.decoding_time = time() - self.start_decoding
|
64 |
return
|
65 |
+
|
66 |
|
67 |
class HFLMWithMeasurement(HFLM):
|
68 |
def __init__(self, **kwargs):
|
69 |
super().__init__(**kwargs)
|
|
70 |
|
71 |
def _loglikelihood_tokens(
|
72 |
self,
|
|
|
279 |
# Answer: (log prob, is-exact-match)
|
280 |
answer = (float(logits.sum()), bool(max_equal))
|
281 |
|
282 |
+
res.append((answer, per_sample_time, 0, 0))
|
283 |
|
284 |
self.cache_hook.add_partial("loglikelihood", request_str, answer)
|
285 |
pbar.update(1)
|
|
|
287 |
pbar.close()
|
288 |
|
289 |
return re_ord.get_original(res)
|
290 |
+
|
291 |
+
def _model_generate(self, context, max_length, stop, **generation_kwargs):
|
292 |
# temperature = 0.0 if not set
|
293 |
# if do_sample is false and temp==0.0:
|
294 |
# remove temperature, as do_sample=False takes care of this
|
295 |
# and we don't want a warning from HF
|
296 |
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
|
297 |
do_sample = generation_kwargs.get("do_sample", None)
|
|
|
|
|
298 |
|
299 |
# The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
|
300 |
if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
|
|
|
302 |
|
303 |
if do_sample is False and generation_kwargs.get("temperature") == 0.0:
|
304 |
generation_kwargs.pop("temperature")
|
305 |
+
# build stopping criteria
|
306 |
stopping_criteria = stop_sequences_criteria(
|
307 |
self.tokenizer, stop, context.shape[1], context.shape[0]
|
308 |
)
|
|
|
310 |
start = time()
|
311 |
res = self.model.generate(
|
312 |
input_ids=context,
|
313 |
+
max_length=max_length,
|
314 |
stopping_criteria=stopping_criteria,
|
315 |
pad_token_id=self.tokenizer.pad_token_id,
|
316 |
use_cache=True,
|
|
|
318 |
**generation_kwargs,
|
319 |
)
|
320 |
end = time()
|
321 |
+
|
322 |
batch_size = context.shape[0]
|
323 |
output_length = stop_watch.decoding_iterations
|
324 |
|
325 |
end_to_end_time = (end - start) / batch_size
|
326 |
prefilling_time = stop_watch.prefilling_time / batch_size
|
327 |
decoding_time = stop_watch.decoding_time / batch_size
|
328 |
token_per_sec = output_length / decoding_time
|
329 |
+
return res, end_to_end_time, prefilling_time, token_per_sec
|
330 |
|
331 |
def generate_until(
|
332 |
self, requests: List[Instance], disable_tqdm: bool = False
|
|
|
403 |
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
|
404 |
)
|
405 |
# add EOS token to stop sequences
|
406 |
+
eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
|
407 |
if not until:
|
408 |
until = [eos]
|
409 |
else:
|
410 |
until.append(eos)
|
411 |
if "max_gen_toks" in kwargs.keys():
|
412 |
max_gen_toks = kwargs.pop("max_gen_toks")
|
413 |
else:
|
|
|
427 |
left_truncate_len=max_ctx_len,
|
428 |
truncation=self.truncation,
|
429 |
)
|
|
|
|
|
430 |
context_enc = context_enc.to(self.device)
|
431 |
attn_masks = attn_masks.to(self.device)
|
432 |
|
433 |
+
if "max_length" not in kwargs:
|
434 |
+
kwargs["max_length"] = context_enc.shape[1] + max_gen_toks
|
435 |
|
436 |
# perform batched generation
|
437 |
+
cont, end_to_end_time, prefilling_time, token_per_sec = self._model_generate(
|
438 |
context=context_enc,
|
439 |
attention_mask=attn_masks,
|
440 |
stop=until,
|
|
|
445 |
for cont_toks, context in zip(cont_toks_list, contexts):
|
446 |
# discard context + left-padding toks if using causal decoder-only LM
|
447 |
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
|
|
|
448 |
cont_toks = cont_toks[context_enc.shape[1] :]
|
449 |
+
|
450 |
s = self.tok_decode(cont_toks)
|
451 |
|
452 |
+
# use secondary stop seqs to cut off should-have-been-stopped content post-hoc
|
|
|
453 |
for term in until:
|
454 |
if len(term) > 0:
|
455 |
# ignore '' separator,
|
456 |
# for seq2seq case where self.tok_decode(self.eot_token_id) = ''
|
457 |
s = s.split(term)[0]
|
458 |
+
|
459 |
+
res.append((s, end_to_end_time, prefilling_time, token_per_sec))
|
|
|
460 |
|
461 |
self.cache_hook.add_partial("generate_until", (context, gen_kwargs), s)
|
462 |
pbar.update(1)
|
src/backend/manage_requests.py
CHANGED
@@ -27,24 +27,24 @@ class EvalRequest:
|
|
27 |
likes: Optional[int] = 0
|
28 |
params: Optional[int] = None
|
29 |
license: Optional[str] = ""
|
30 |
-
batch_size: Optional[int] = 1
|
31 |
-
gpu_type: Optional[str] = "NVIDIA-A100-PCIe-80GB"
|
32 |
|
33 |
def get_model_args(self) -> str:
|
34 |
model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
|
35 |
-
|
36 |
if self.precision in ["float16", "float32", "bfloat16"]:
|
37 |
model_args += f",dtype={self.precision}"
|
38 |
# Quantized models need some added config, the install of bits and bytes, etc
|
39 |
# elif self.precision == "8bit":
|
40 |
# model_args += ",load_in_8bit=True"
|
41 |
-
|
42 |
-
|
43 |
# elif self.precision == "GPTQ":
|
44 |
# A GPTQ model does not need dtype to be specified,
|
45 |
# it will be inferred from the config
|
|
|
46 |
elif self.precision == "8bit":
|
47 |
model_args += ",load_in_8bit=True"
|
|
|
48 |
else:
|
49 |
raise Exception(f"Unknown precision {self.precision}.")
|
50 |
return model_args
|
|
|
27 |
likes: Optional[int] = 0
|
28 |
params: Optional[int] = None
|
29 |
license: Optional[str] = ""
|
|
|
|
|
30 |
|
31 |
def get_model_args(self) -> str:
|
32 |
model_args = f"pretrained={self.model},revision={self.revision},parallelize=True" # ,max_length=4096"
|
33 |
+
|
34 |
if self.precision in ["float16", "float32", "bfloat16"]:
|
35 |
model_args += f",dtype={self.precision}"
|
36 |
# Quantized models need some added config, the install of bits and bytes, etc
|
37 |
# elif self.precision == "8bit":
|
38 |
# model_args += ",load_in_8bit=True"
|
39 |
+
# elif self.precision == "4bit":
|
40 |
+
# model_args += ",load_in_4bit=True"
|
41 |
# elif self.precision == "GPTQ":
|
42 |
# A GPTQ model does not need dtype to be specified,
|
43 |
# it will be inferred from the config
|
44 |
+
pass
|
45 |
elif self.precision == "8bit":
|
46 |
model_args += ",load_in_8bit=True"
|
47 |
+
model_args += ",trust_remote_code=True"
|
48 |
else:
|
49 |
raise Exception(f"Unknown precision {self.precision}.")
|
50 |
return model_args
|
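For illustration, the branch added above makes an 8-bit request serialize to a model_args string that also opts into remote code. A small sketch mirroring the string-building logic in the diff; the model name and revision are hypothetical.

```python
# Hypothetical inputs; mirrors the string-building logic shown in the diff above.
model, revision, precision = "org/some-moe-model", "main", "8bit"

model_args = f"pretrained={model},revision={revision},parallelize=True"
if precision in ["float16", "float32", "bfloat16"]:
    model_args += f",dtype={precision}"
elif precision == "8bit":
    model_args += ",load_in_8bit=True"
    model_args += ",trust_remote_code=True"
print(model_args)
# pretrained=org/some-moe-model,revision=main,parallelize=True,load_in_8bit=True,trust_remote_code=True
```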
src/backend/moe_infinity.py
CHANGED
@@ -31,20 +31,15 @@ class MoEHFLM(HFLMWithMeasurement):
|
|
31 |
self.use_chat_template = use_chat_template
|
32 |
if "device" in kwargs:
|
33 |
kwargs.pop("device")
|
34 |
-
if os.path.exists(os.path.join(self.offload_path, "moe-infinity-offloads")):
|
35 |
-
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
|
36 |
-
kwargs["device_map"] = "cuda:0"
|
37 |
super().__init__(
|
38 |
-
*args, **kwargs, pretrained=pretrained
|
39 |
) # Assuming HFLM accepts a 'pretrained' arg and handles it
|
40 |
# self._create_model()
|
|
|
41 |
|
42 |
def __del__(self):
|
43 |
-
|
44 |
-
|
45 |
-
if os.path.exists(os.path.join(self.offload_path, "moe-infinity-offloads")):
|
46 |
-
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads")) # clean up offload model
|
47 |
-
|
48 |
|
49 |
def _create_model(self, *args, **kwargs):
|
50 |
"""
|
|
|
31 |
self.use_chat_template = use_chat_template
|
32 |
if "device" in kwargs:
|
33 |
kwargs.pop("device")
|
|
|
|
|
|
|
34 |
super().__init__(
|
35 |
+
*args, **kwargs, pretrained=pretrained, device_map="cuda:0"
|
36 |
) # Assuming HFLM accepts a 'pretrained' arg and handles it
|
37 |
# self._create_model()
|
38 |
+
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
|
39 |
|
40 |
def __del__(self):
|
41 |
+
# Clean up offloaded models from self.offload_path
|
42 |
+
shutil.rmtree(os.path.join(self.offload_path, "moe-infinity-offloads"))
|
|
|
|
|
|
|
43 |
|
44 |
def _create_model(self, *args, **kwargs):
|
45 |
"""
|
src/backend/run_eval_suite.py
CHANGED
@@ -13,20 +13,16 @@ orig_higher_is_better = ConfigurableTask.higher_is_better
|
|
13 |
def process_results_decorator(func):
|
14 |
def wrapper(self, doc, results, *args, **kwargs):
|
15 |
processed_results = [r[0] for r in results]
|
16 |
-
|
17 |
end_to_end_time = sum([r[1] for r in results]) / len(results)
|
18 |
prefilling_time = sum([r[2] for r in results]) / len(results)
|
19 |
decoding_throughput = sum([r[3] for r in results]) / len(results)
|
20 |
-
mfu = sum([r[4] for r in results]) / len(results)
|
21 |
-
mbu = sum([r[5] for r in results]) / len(results)
|
22 |
# print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
|
23 |
|
24 |
result_dict = func(self, doc, processed_results, *args, **kwargs)
|
25 |
result_dict["end_to_end_time"] = end_to_end_time
|
26 |
result_dict["prefilling_time"] = prefilling_time
|
27 |
result_dict["decoding_throughput"] = decoding_throughput
|
28 |
-
result_dict["mfu"] = mfu
|
29 |
-
result_dict["mbu"] = mbu
|
30 |
return result_dict
|
31 |
return wrapper
|
32 |
ConfigurableTask.process_results = process_results_decorator(orig_process_results)
|
@@ -37,8 +33,6 @@ def aggregation_decorator(func):
|
|
37 |
aggregation_list["end_to_end_time"] = mean
|
38 |
aggregation_list["prefilling_time"] = mean
|
39 |
aggregation_list["decoding_throughput"] = mean
|
40 |
-
aggregation_list["mfu"] = mean
|
41 |
-
aggregation_list["mbu"] = mean
|
42 |
return aggregation_list
|
43 |
return wrapper
|
44 |
ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
|
@@ -49,8 +43,6 @@ def higher_is_better_decorator(func):
|
|
49 |
higher_is_better_dict["end_to_end_time"] = False
|
50 |
higher_is_better_dict["prefilling_time"] = False
|
51 |
higher_is_better_dict["decoding_throughput"] = True
|
52 |
-
higher_is_better_dict["mfu"] = True
|
53 |
-
higher_is_better_dict["mbu"] = True
|
54 |
return higher_is_better_dict
|
55 |
return wrapper
|
56 |
ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
|
|
|
13 |
def process_results_decorator(func):
|
14 |
def wrapper(self, doc, results, *args, **kwargs):
|
15 |
processed_results = [r[0] for r in results]
|
16 |
+
|
17 |
end_to_end_time = sum([r[1] for r in results]) / len(results)
|
18 |
prefilling_time = sum([r[2] for r in results]) / len(results)
|
19 |
decoding_throughput = sum([r[3] for r in results]) / len(results)
|
|
|
|
|
20 |
# print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
|
21 |
|
22 |
result_dict = func(self, doc, processed_results, *args, **kwargs)
|
23 |
result_dict["end_to_end_time"] = end_to_end_time
|
24 |
result_dict["prefilling_time"] = prefilling_time
|
25 |
result_dict["decoding_throughput"] = decoding_throughput
|
|
|
|
|
26 |
return result_dict
|
27 |
return wrapper
|
28 |
ConfigurableTask.process_results = process_results_decorator(orig_process_results)
|
|
|
33 |
aggregation_list["end_to_end_time"] = mean
|
34 |
aggregation_list["prefilling_time"] = mean
|
35 |
aggregation_list["decoding_throughput"] = mean
|
|
|
|
|
36 |
return aggregation_list
|
37 |
return wrapper
|
38 |
ConfigurableTask.aggregation = aggregation_decorator(orig_aggregation)
|
|
|
43 |
higher_is_better_dict["end_to_end_time"] = False
|
44 |
higher_is_better_dict["prefilling_time"] = False
|
45 |
higher_is_better_dict["decoding_throughput"] = True
|
|
|
|
|
46 |
return higher_is_better_dict
|
47 |
return wrapper
|
48 |
ConfigurableTask.higher_is_better = higher_is_better_decorator(orig_higher_is_better)
|
src/backend/tasks/arena_hard/__init__.py
DELETED
File without changes
|
src/backend/tasks/arena_hard/arena_hard.yaml
DELETED
@@ -1,2 +0,0 @@
|
|
1 |
-
task: arena_hard
|
2 |
-
class: !function task.ArenaHard
|
|
|
|
|
|
src/backend/tasks/arena_hard/arena_judgment.py
DELETED
@@ -1,256 +0,0 @@
|
|
1 |
-
'''
|
2 |
-
This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
|
3 |
-
under the Apache 2.0 License from the arena-hard project.
|
4 |
-
(https://github.com/lm-sys/arena-hard)
|
5 |
-
Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
|
6 |
-
See the NOTICE file distributed with this work for additional
|
7 |
-
information regarding copyright ownership.
|
8 |
-
'''
|
9 |
-
|
10 |
-
import pandas as pd
|
11 |
-
from tqdm import tqdm
|
12 |
-
import numpy as np
|
13 |
-
from sklearn.linear_model import LogisticRegression
|
14 |
-
import math
|
15 |
-
from collections import defaultdict
|
16 |
-
from tqdm import tqdm
|
17 |
-
|
18 |
-
from src.backend.tasks.arena_hard.arena_utils import (
|
19 |
-
chat_completion_openai,
|
20 |
-
load_questions,
|
21 |
-
load_model_answers,
|
22 |
-
get_endpoint,
|
23 |
-
make_config,
|
24 |
-
)
|
25 |
-
|
26 |
-
|
27 |
-
def get_score(judgment, pattern, pairwise=True):
|
28 |
-
matches = pattern.findall(judgment)
|
29 |
-
matches = [m for m in matches if m != ""]
|
30 |
-
if len(set(matches)) == 0:
|
31 |
-
return None, True
|
32 |
-
elif len(set(matches)) == 1:
|
33 |
-
if pairwise:
|
34 |
-
return matches[0].strip("\n"), False
|
35 |
-
return int(matches[0])
|
36 |
-
else:
|
37 |
-
return None, False
|
38 |
-
|
39 |
-
|
40 |
-
# get answer from model
|
41 |
-
def get_answer(model, conv, temperature, max_tokens, endpoint_dict=None):
|
42 |
-
api_dict = get_endpoint(endpoint_dict["endpoints"])
|
43 |
-
|
44 |
-
# if endpoint_dict["api_type"] == "anthropic":
|
45 |
-
# output = chat_completion_anthropic(model, conv, temperature, max_tokens)
|
46 |
-
# elif endpoint_dict["api_type"] == "azure":
|
47 |
-
# output = chat_completion_openai_azure(model, conv, temperature, max_tokens, api_dict)
|
48 |
-
|
49 |
-
output = chat_completion_openai(model, conv, temperature, max_tokens, api_dict)
|
50 |
-
return output
|
51 |
-
|
52 |
-
|
53 |
-
def judgment(**args):
|
54 |
-
question = args["question"]
|
55 |
-
answer = args["answer"]
|
56 |
-
reference = args["reference"]
|
57 |
-
baseline = args["baseline_answer"]
|
58 |
-
configs = args["configs"]
|
59 |
-
# output_file = args["output_file"]
|
60 |
-
model = configs["judge_model"]
|
61 |
-
|
62 |
-
num_games = 2 if configs["pairwise"] else 1
|
63 |
-
|
64 |
-
# output = {
|
65 |
-
# "question_id":question["question_id"],
|
66 |
-
# "judge": model,
|
67 |
-
# "model": "custom_model",
|
68 |
-
# "games":[]
|
69 |
-
# }
|
70 |
-
output = [question["question_id"]]
|
71 |
-
|
72 |
-
for game in range(num_games):
|
73 |
-
conv = [{"role": "system", "content": configs["system_prompt"]}]
|
74 |
-
|
75 |
-
for template in configs["prompt_template"]:
|
76 |
-
prompt_args = {}
|
77 |
-
|
78 |
-
prompt_args[f"question_{1}"] = question["content"]
|
79 |
-
base = 1
|
80 |
-
|
81 |
-
if baseline:
|
82 |
-
if game % 2 == 1: # swap position
|
83 |
-
temp = baseline
|
84 |
-
baseline = answer
|
85 |
-
answer = temp
|
86 |
-
|
87 |
-
if game == 0:
|
88 |
-
for i, turn in enumerate(baseline["choices"][0]["turns"]):
|
89 |
-
prompt_args[f"answer_{i+1}"] = turn["content"]
|
90 |
-
base += 1
|
91 |
-
|
92 |
-
if game == 1:
|
93 |
-
prompt_args[f"answer_{1}"] = baseline
|
94 |
-
base += 1
|
95 |
-
|
96 |
-
if answer:
|
97 |
-
prompt_args[f"answer_{base}"] = answer
|
98 |
-
|
99 |
-
if reference:
|
100 |
-
for j, ref_answer in enumerate(reference):
|
101 |
-
for i, turn in enumerate(ref_answer["choices"][0]["turns"]):
|
102 |
-
prompt_args[f"ref_answer_{i+j+1}"] = turn["content"]
|
103 |
-
|
104 |
-
user_prompt = template.format(**prompt_args)
|
105 |
-
conv.append({"role": "user", "content": user_prompt})
|
106 |
-
|
107 |
-
judgment = ""
|
108 |
-
for _ in range(2):
|
109 |
-
new_judgment = get_answer(
|
110 |
-
model,
|
111 |
-
conv,
|
112 |
-
configs["temperature"],
|
113 |
-
configs["max_tokens"],
|
114 |
-
args["endpoint_dict"],
|
115 |
-
)
|
116 |
-
|
117 |
-
judgment += ("\n" + new_judgment)
|
118 |
-
|
119 |
-
score, try_again = get_score(judgment, args["regex_pattern"])
|
120 |
-
|
121 |
-
conv.append({"role": "assistant", "content": new_judgment})
|
122 |
-
|
123 |
-
if not try_again:
|
124 |
-
break
|
125 |
-
|
126 |
-
conv.append({"role": "user", "content": "continue your judgment and finish by outputting a final verdict label"})
|
127 |
-
print("Finish judgment!!!")
|
128 |
-
# result = {
|
129 |
-
# "user_prompt": conv[1]["content"],
|
130 |
-
# "judgment": judgment,
|
131 |
-
# "score":score
|
132 |
-
# }
|
133 |
-
output.append(score)
|
134 |
-
|
135 |
-
return output
|
136 |
-
|
137 |
-
def get_battles_from_scores(score_list, first_game_only=False, WEIGHT=3):
|
138 |
-
arena_hard_battles = pd.DataFrame()
|
139 |
-
|
140 |
-
print("Turning score list into battles...")
|
141 |
-
|
142 |
-
for scores in tqdm(score_list):
|
143 |
-
question_id, score1, score2 = scores
|
144 |
-
|
145 |
-
# Process game 1
|
146 |
-
output = {"question_id": question_id,
|
147 |
-
"model_a": "gpt-4-0314",
|
148 |
-
"model_b": f"custom_model"} # Unique identifier for model
|
149 |
-
weight = 1
|
150 |
-
if score1 == "A=B":
|
151 |
-
output["winner"] = "tie"
|
152 |
-
elif score1 == "A>B":
|
153 |
-
output["winner"] = "model_a"
|
154 |
-
elif score1 == "A>>B":
|
155 |
-
output["winner"] = "model_a"
|
156 |
-
weight = WEIGHT
|
157 |
-
elif score1 == "B>A":
|
158 |
-
output["winner"] = "model_b"
|
159 |
-
elif score1 == "B>>A":
|
160 |
-
output["winner"] = "model_b"
|
161 |
-
weight = WEIGHT
|
162 |
-
else:
|
163 |
-
weight = 0
|
164 |
-
|
165 |
-
if weight:
|
166 |
-
arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
|
167 |
-
|
168 |
-
if not first_game_only:
|
169 |
-
# Process game 2
|
170 |
-
output = {"question_id": question_id,
|
171 |
-
"model_a": "gpt-4-0314",
|
172 |
-
"model_b": f"custom_model"} # Unique identifier for model
|
173 |
-
weight = 1
|
174 |
-
if score2 == "A=B":
|
175 |
-
output["winner"] = "tie"
|
176 |
-
elif score2 == "A>B":
|
177 |
-
output["winner"] = "model_b"
|
178 |
-
elif score2 == "A>>B":
|
179 |
-
output["winner"] = "model_b"
|
180 |
-
weight = WEIGHT
|
181 |
-
elif score2 == "B>A":
|
182 |
-
output["winner"] = "model_a"
|
183 |
-
elif score2 == "B>>A":
|
184 |
-
output["winner"] = "model_a"
|
185 |
-
weight = WEIGHT
|
186 |
-
else:
|
187 |
-
weight = 0
|
188 |
-
|
189 |
-
if weight:
|
190 |
-
arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])
|
191 |
-
|
192 |
-
arena_hard_battles.to_json("./arena_hard_battles.jsonl", lines=True, orient="records")
|
193 |
-
return arena_hard_battles
|
194 |
-
|
195 |
-
def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
|
196 |
-
models = pd.concat([df["model_a"], df["model_b"]]).unique()
|
197 |
-
models = pd.Series(np.arange(len(models)), index=models)
|
198 |
-
|
199 |
-
LOW_RATING = 100
|
200 |
-
# duplicate battles
|
201 |
-
df = pd.concat([df, df], ignore_index=True)
|
202 |
-
p = len(models.index)
|
203 |
-
n = df.shape[0]
|
204 |
-
|
205 |
-
X = np.zeros([n, p])
|
206 |
-
X[np.arange(n), models[df["model_a"]]] = +math.log(BASE)
|
207 |
-
X[np.arange(n), models[df["model_b"]]] = -math.log(BASE)
|
208 |
-
|
209 |
-
# one A win => two A win
|
210 |
-
Y = np.zeros(n)
|
211 |
-
Y[df["winner"] == "model_a"] = 1.0
|
212 |
-
|
213 |
-
# one tie => one A win + one B win
|
214 |
-
# find tie + tie (both bad) index
|
215 |
-
tie_idx = (df["winner"] == "tie") | (df["winner"] == "tie (bothbad)")
|
216 |
-
tie_idx[len(tie_idx)//2:] = False
|
217 |
-
Y[tie_idx] = 1.0
|
218 |
-
|
219 |
-
if len(np.unique(Y)) == 1:
|
220 |
-
# If there's only one class in the data, assign default ratings
|
221 |
-
elo_scores = np.full(p, LOW_RATING)
|
222 |
-
elo_scores[models["gpt-4-0314"]] = INIT_RATING
|
223 |
-
else:
|
224 |
-
lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
|
225 |
-
lr.fit(X,Y)
|
226 |
-
|
227 |
-
elo_scores = SCALE * lr.coef_[0] + INIT_RATING
|
228 |
-
|
229 |
-
# set anchor as gpt-4-0314 = 1000
|
230 |
-
if "gpt-4-0314" in models.index:
|
231 |
-
elo_scores += 1000 - elo_scores[models["gpt-4-0314"]]
|
232 |
-
return pd.Series(elo_scores, index = models.index).sort_values(ascending=False)
|
233 |
-
|
234 |
-
def predict_win_rate(elo_ratings, SCALE=400, BASE=10, INIT_RATING=1000):
|
235 |
-
names = sorted(list(elo_ratings.keys()))
|
236 |
-
wins = defaultdict(lambda: defaultdict(lambda: 0))
|
237 |
-
for a in names:
|
238 |
-
for b in names:
|
239 |
-
ea = 1 / (1 + BASE ** ((elo_ratings[b] - elo_ratings[a]) / SCALE))
|
240 |
-
wins[a][b] = ea
|
241 |
-
wins[b][a] = 1 - ea
|
242 |
-
|
243 |
-
data = {
|
244 |
-
a: [wins[a][b] if a != b else np.NAN for b in names]
|
245 |
-
for a in names
|
246 |
-
}
|
247 |
-
|
248 |
-
df = pd.DataFrame(data, index=names)
|
249 |
-
df.index.name = "model_a"
|
250 |
-
df.columns.name = "model_b"
|
251 |
-
return df.T
|
252 |
-
|
253 |
-
def get_win_rate_column(df, column, baseline="gpt-4-0314"):
|
254 |
-
to_dict = df[["model", column]].set_index("model").to_dict()[column]
|
255 |
-
win_rate_table = predict_win_rate(to_dict)
|
256 |
-
return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))
|
|
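For context on the deleted judgment code: predict_win_rate converts Elo-style ratings into pairwise win probabilities with the standard Bradley-Terry / Elo logistic, using the SCALE=400 and BASE=10 defaults visible above:

```latex
P(a \text{ beats } b) = \frac{1}{1 + \mathrm{BASE}^{(R_b - R_a)/\mathrm{SCALE}}}
                      = \frac{1}{1 + 10^{(R_b - R_a)/400}}
```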
src/backend/tasks/arena_hard/arena_utils.py
DELETED
@@ -1,349 +0,0 @@
|
|
1 |
-
'''
|
2 |
-
This file is part of Open-MoE-LLM-Leaderboard and is modified based on work
|
3 |
-
under the Apache 2.0 License from the arena-hard project.
|
4 |
-
(https://github.com/lm-sys/arena-hard)
|
5 |
-
Original Copyright (c) 2024 Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica
|
6 |
-
See the NOTICE file distributed with this work for additional
|
7 |
-
information regarding copyright ownership.
|
8 |
-
'''
|
9 |
-
|
10 |
-
|
11 |
-
import os
|
12 |
-
import json
|
13 |
-
import time
|
14 |
-
import yaml
|
15 |
-
import random
|
16 |
-
|
17 |
-
from typing import Optional
|
18 |
-
from glob import glob
|
19 |
-
|
20 |
-
# API setting constants
|
21 |
-
API_MAX_RETRY = 16
|
22 |
-
API_RETRY_SLEEP = 10
|
23 |
-
API_ERROR_OUTPUT = "$ERROR$"
|
24 |
-
|
25 |
-
|
26 |
-
OPENAI_MODEL_LIST = (
|
27 |
-
"gpt-3.5-turbo",
|
28 |
-
"gpt-3.5-turbo-0301",
|
29 |
-
"gpt-3.5-turbo-0613",
|
30 |
-
"gpt-3.5-turbo-0613-verbose",
|
31 |
-
"gpt-3.5-turbo-1106",
|
32 |
-
"gpt-3.5-turbo-0125",
|
33 |
-
"gpt-4",
|
34 |
-
"gpt-4-0314",
|
35 |
-
"gpt-4-0613",
|
36 |
-
"gpt-4-turbo",
|
37 |
-
"gpt-4-1106-preview",
|
38 |
-
"gpt-4-0125-preview",
|
39 |
-
)
|
40 |
-
|
41 |
-
|
42 |
-
temperature_config = {
|
43 |
-
"writing": 0.7,
|
44 |
-
"roleplay": 0.7,
|
45 |
-
"extraction": 0.0,
|
46 |
-
"math": 0.0,
|
47 |
-
"coding": 0.0,
|
48 |
-
"reasoning": 0.0,
|
49 |
-
"stem": 0.1,
|
50 |
-
"humanities": 0.1,
|
51 |
-
}
|
52 |
-
|
53 |
-
|
54 |
-
def load_questions(question_file: str):
|
55 |
-
"""Load questions from a file."""
|
56 |
-
questions = []
|
57 |
-
with open(question_file, "r") as ques_file:
|
58 |
-
for line in ques_file:
|
59 |
-
if line:
|
60 |
-
questions.append(json.loads(line))
|
61 |
-
return questions
|
62 |
-
|
63 |
-
|
64 |
-
def load_model_answers(answer_dir: str):
|
65 |
-
"""Load model answers.
|
66 |
-
|
67 |
-
The return value is a python dict of type:
|
68 |
-
Dict[model_name: str -> Dict[question_id: int -> answer: dict]]
|
69 |
-
"""
|
70 |
-
filenames = glob(os.path.join(answer_dir, "*.jsonl"))
|
71 |
-
filenames.sort()
|
72 |
-
model_answers = {}
|
73 |
-
|
74 |
-
for filename in filenames:
|
75 |
-
model_name = os.path.basename(filename)[:-6]
|
76 |
-
answer = {}
|
77 |
-
with open(filename) as fin:
|
78 |
-
for line in fin:
|
79 |
-
line = json.loads(line)
|
80 |
-
answer[line["question_id"]] = line
|
81 |
-
model_answers[model_name] = answer
|
82 |
-
|
83 |
-
return model_answers
|
84 |
-
|
85 |
-
|
86 |
-
def get_endpoint(endpoint_list):
|
87 |
-
if endpoint_list is None:
|
88 |
-
return None
|
89 |
-
assert endpoint_list is not None
|
90 |
-
# randomly pick one
|
91 |
-
api_dict = random.choices(
|
92 |
-
endpoint_list
|
93 |
-
)[0]
|
94 |
-
return api_dict
|
95 |
-
|
96 |
-
|
97 |
-
# load config args from config yaml files
|
98 |
-
def make_config(config_file: str) -> dict:
|
99 |
-
config_kwargs = {}
|
100 |
-
with open(config_file, "r") as f:
|
101 |
-
config_kwargs = yaml.load(f, Loader=yaml.SafeLoader)
|
102 |
-
|
103 |
-
return config_kwargs
|
104 |
-
|
105 |
-
|
106 |
-
def chat_completion_openai(model, messages, temperature, max_tokens, api_dict=None):
|
107 |
-
import openai
|
108 |
-
if api_dict:
|
109 |
-
client = openai.OpenAI(
|
110 |
-
base_url=api_dict["api_base"],
|
111 |
-
api_key=api_dict["api_key"],
|
112 |
-
)
|
113 |
-
else:
|
114 |
-
client = openai.OpenAI()
|
115 |
-
|
116 |
-
output = API_ERROR_OUTPUT
|
117 |
-
for _ in range(API_MAX_RETRY):
|
118 |
-
try:
|
119 |
-
# print(messages)
|
120 |
-
completion = client.chat.completions.create(
|
121 |
-
model=model,
|
122 |
-
messages=messages,
|
123 |
-
temperature=temperature,
|
124 |
-
max_tokens=max_tokens
|
125 |
-
)
|
126 |
-
output = completion.choices[0].message.content
|
127 |
-
break
|
128 |
-
except openai.RateLimitError as e:
|
129 |
-
print(type(e), e)
|
130 |
-
time.sleep(API_RETRY_SLEEP)
|
131 |
-
except openai.BadRequestError as e:
|
132 |
-
print(messages)
|
133 |
-
print(type(e), e)
|
134 |
-
except KeyError:
|
135 |
-
print(type(e), e)
|
136 |
-
break
|
137 |
-
|
138 |
-
return output
|
139 |
-
|
140 |
-
|
141 |
-
# def chat_completion_openai_azure(model, messages, temperature, max_tokens, api_dict=None):
|
142 |
-
# import openai
|
143 |
-
# from openai import AzureOpenAI
|
144 |
-
|
145 |
-
# api_base = api_dict["api_base"]
|
146 |
-
# client = AzureOpenAI(
|
147 |
-
# azure_endpoint = api_base,
|
148 |
-
# api_key= api_dict["api_key"],
|
149 |
-
# api_version=api_dict["api_version"],
|
150 |
-
# timeout=240,
|
151 |
-
# max_retries=2
|
152 |
-
# )
|
153 |
-
|
154 |
-
# output = API_ERROR_OUTPUT
|
155 |
-
# for _ in range(API_MAX_RETRY):
|
156 |
-
# try:
|
157 |
-
# response = client.chat.completions.create(
|
158 |
-
# model=model,
|
159 |
-
# messages=messages,
|
160 |
-
# n=1,
|
161 |
-
# temperature=temperature,
|
162 |
-
# max_tokens=max_tokens,
|
163 |
-
# seed=42,
|
164 |
-
# )
|
165 |
-
# output = response.choices[0].message.content
|
166 |
-
# break
|
167 |
-
# except openai.RateLimitError as e:
|
168 |
-
# print(type(e), e)
|
169 |
-
# time.sleep(API_RETRY_SLEEP)
|
170 |
-
# except openai.BadRequestError as e:
|
171 |
-
# print(type(e), e)
|
172 |
-
# break
|
173 |
-
# except KeyError:
|
174 |
-
# print(type(e), e)
|
175 |
-
# break
|
176 |
-
|
177 |
-
# return output
|
178 |
-
|
179 |
-
|
180 |
-
# def chat_completion_anthropic(model, messages, temperature, max_tokens, api_dict=None):
|
181 |
-
# import anthropic
|
182 |
-
|
183 |
-
# if api_dict:
|
184 |
-
# api_key = api_dict["api_key"]
|
185 |
-
# else:
|
186 |
-
# api_key = os.environ["ANTHROPIC_API_KEY"]
|
187 |
-
|
188 |
-
# sys_msg = ""
|
189 |
-
# if messages[0]["role"] == "system":
|
190 |
-
# sys_msg = messages[0]["content"]
|
191 |
-
# messages = messages[1:]
|
192 |
-
|
193 |
-
# output = API_ERROR_OUTPUT
|
194 |
-
# for _ in range(API_MAX_RETRY):
|
195 |
-
# try:
|
196 |
-
# # print(sys_msg)
|
197 |
-
# c = anthropic.Anthropic(api_key=api_key)
|
198 |
-
# response = c.messages.create(
|
199 |
-
# model=model,
|
200 |
-
# messages=messages,
|
201 |
-
# stop_sequences=[anthropic.HUMAN_PROMPT],
|
202 |
-
# max_tokens=max_tokens,
|
203 |
-
# temperature=temperature,
|
204 |
-
# system=sys_msg
|
205 |
-
# )
|
206 |
-
# output = response.content[0].text
|
207 |
-
# break
|
208 |
-
# except anthropic.APIError as e:
|
209 |
-
# print(type(e), e)
|
210 |
-
# time.sleep(API_RETRY_SLEEP)
|
211 |
-
# return output
|
212 |
-
|
213 |
-
|
214 |
-
# def chat_completion_mistral(model, messages, temperature, max_tokens):
|
215 |
-
# from mistralai.client import MistralClient
|
216 |
-
# from mistralai.models.chat_completion import ChatMessage
|
217 |
-
# from mistralai.exceptions import MistralException
|
218 |
-
|
219 |
-
# api_key = os.environ["MISTRAL_API_KEY"]
|
220 |
-
# client = MistralClient(api_key=api_key)
|
221 |
-
|
222 |
-
# prompts = [ChatMessage(role=message["role"], content=message["content"]) for message in messages]
|
223 |
-
|
224 |
-
# output = API_ERROR_OUTPUT
|
225 |
-
# for _ in range(API_MAX_RETRY):
|
226 |
-
# try:
|
227 |
-
# chat_response = client.chat(
|
228 |
-
# model=model,
|
229 |
-
# messages=prompts,
|
230 |
-
# temperature=temperature,
|
231 |
-
# max_tokens=max_tokens,
|
232 |
-
# )
|
233 |
-
# output = chat_response.choices[0].message.content
|
234 |
-
# break
|
235 |
-
# except MistralException as e:
|
236 |
-
# print(type(e), e)
|
237 |
-
# break
|
238 |
-
|
239 |
-
# return output
|
240 |
-
|
241 |
-
|
242 |
-
# def chat_completion_gemini(model, messages, temperature, max_tokens):
|
243 |
-
# import google.generativeai as genai
|
244 |
-
# genai.configure(api_key=os.environ["GEMINI_API_KEY"])
|
245 |
-
|
246 |
-
# safety_settings = [
|
247 |
-
# {
|
248 |
-
# "category": "HARM_CATEGORY_HARASSMENT",
|
249 |
-
# "threshold": "BLOCK_NONE"
|
250 |
-
# },
|
251 |
-
# {
|
252 |
-
# "category": "HARM_CATEGORY_HATE_SPEECH",
|
253 |
-
# "threshold": "BLOCK_NONE"
|
254 |
-
# },
|
255 |
-
# {
|
256 |
-
# "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
|
257 |
-
# "threshold": "BLOCK_NONE"
|
258 |
-
# },
|
259 |
-
# {
|
260 |
-
# "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
|
261 |
-
# "threshold": "BLOCK_NONE"
|
262 |
-
# },
|
263 |
-
# ]
|
264 |
-
|
265 |
-
# # Set up the model
|
266 |
-
# generation_config = {
|
267 |
-
# "temperature": temperature,
|
268 |
-
# "top_p": 1,
|
269 |
-
# "top_k": 1,
|
270 |
-
# "max_output_tokens": max_tokens,
|
271 |
-
# }
|
272 |
-
|
273 |
-
# output = API_ERROR_OUTPUT
|
274 |
-
# for _ in range(API_MAX_RETRY):
|
275 |
-
# try:
|
276 |
-
# gemini = genai.GenerativeModel(
|
277 |
-
# model_name=model,
|
278 |
-
# generation_config=generation_config,
|
279 |
-
# safety_settings=safety_settings)
|
280 |
-
|
281 |
-
# convo = gemini.start_chat(history=[])
|
282 |
-
|
283 |
-
# convo.send_message(messages)
|
284 |
-
# output = convo.last.text
|
285 |
-
# break
|
286 |
-
# except genai.types.generation_types.StopCandidateException as e:
|
287 |
-
# print(type(e), e)
|
288 |
-
# break
|
289 |
-
# except Exception as e:
|
290 |
-
# print(type(e), e)
|
291 |
-
# time.sleep(API_RETRY_SLEEP)
|
292 |
-
|
293 |
-
# return output
|
294 |
-
|
295 |
-
|
296 |
-
# def chat_completion_cohere(model, messages, temperature, max_tokens):
|
297 |
-
# import cohere
|
298 |
-
|
299 |
-
# co = cohere.Client(os.environ["COHERE_API_KEY"])
|
300 |
-
# assert len(messages) > 0
|
301 |
-
|
302 |
-
# template_map = {"system":"SYSTEM",
|
303 |
-
# "assistant":"CHATBOT",
|
304 |
-
# "user":"USER"}
|
305 |
-
|
306 |
-
# assert messages[-1]["role"] == "user"
|
307 |
-
# prompt = messages[-1]["content"]
|
308 |
-
|
309 |
-
# if len(messages) > 1:
|
310 |
-
# history = []
|
311 |
-
# for message in messages[:-1]:
|
312 |
-
# history.append({"role":template_map[message["role"]], "message":message["content"]})
|
313 |
-
# else:
|
314 |
-
# history = None
|
315 |
-
|
316 |
-
# output = API_ERROR_OUTPUT
|
317 |
-
# for _ in range(API_MAX_RETRY):
|
318 |
-
# try:
|
319 |
-
# response = co.chat(
|
320 |
-
# message=prompt,
|
321 |
-
# model=model,
|
322 |
-
# temperature=temperature,
|
323 |
-
# max_tokens=max_tokens,
|
324 |
-
# chat_history=history,
|
325 |
-
# )
|
326 |
-
# output = response.text
|
327 |
-
# break
|
328 |
-
# except cohere.core.api_error.ApiError as e:
|
329 |
-
# print(type(e), e)
|
330 |
-
# raise
|
331 |
-
# except Exception as e:
|
332 |
-
# print(type(e), e)
|
333 |
-
# break
|
334 |
-
|
335 |
-
# return output
|
336 |
-
|
337 |
-
|
338 |
-
def reorg_answer_file(answer_file):
|
339 |
-
"""Sort by question id and de-duplication"""
|
340 |
-
answers = {}
|
341 |
-
with open(answer_file, "r") as fin:
|
342 |
-
for l in fin:
|
343 |
-
qid = json.loads(l)["question_id"]
|
344 |
-
answers[qid] = l
|
345 |
-
|
346 |
-
qids = sorted(list(answers.keys()))
|
347 |
-
with open(answer_file, "w") as fout:
|
348 |
-
for qid in qids:
|
349 |
-
fout.write(answers[qid])
|
|
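The deleted chat_completion_openai helper wraps the API call in a bounded retry loop (API_MAX_RETRY attempts, API_RETRY_SLEEP seconds between them, falling back to API_ERROR_OUTPUT). A generic sketch of that pattern, with no OpenAI dependency and purely illustrative defaults:

```python
import time

# Generic bounded-retry pattern mirroring API_MAX_RETRY / API_RETRY_SLEEP /
# API_ERROR_OUTPUT above; the callable and defaults are illustrative.
def call_with_retries(fn, max_retry=16, sleep_s=10, fallback="$ERROR$"):
    for _ in range(max_retry):
        try:
            return fn()
        except Exception as err:  # the deleted code catches specific OpenAI errors
            print(type(err), err)
            time.sleep(sleep_s)
    return fallback
```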
src/backend/tasks/arena_hard/configs/api_config.yaml
DELETED
@@ -1,17 +0,0 @@
|
|
1 |
-
# gpt-3.5-turbo:
|
2 |
-
# model_name: gpt-3.5-turbo
|
3 |
-
# endpoints: null
|
4 |
-
# api_type: openai
|
5 |
-
# parallel: 8
|
6 |
-
|
7 |
-
gpt-4-1106-preview:
|
8 |
-
model_name: gpt-4-1106-preview
|
9 |
-
endpoints: null
|
10 |
-
api_type: openai
|
11 |
-
parallel: 8
|
12 |
-
|
13 |
-
# llama3-7b:
|
14 |
-
# model_name: llama3-7b
|
15 |
-
# endpoints: null
|
16 |
-
# api_type: openai
|
17 |
-
# parallel: 8
|
|
src/backend/tasks/arena_hard/configs/judge_config.yaml
DELETED
@@ -1,26 +0,0 @@
|
|
1 |
-
name: judgment config file for Arena Hard
|
2 |
-
|
3 |
-
bench_name: arena-hard-v0.1
|
4 |
-
|
5 |
-
# Arena Hard default
|
6 |
-
judge_model: gpt-4-1106-preview
|
7 |
-
# judge_model: gpt-3.5-turbo
|
8 |
-
reference: False # Optional
|
9 |
-
ref_model: null
|
10 |
-
|
11 |
-
baseline: True
|
12 |
-
baseline_model: gpt-4-0314
|
13 |
-
|
14 |
-
pairwise: True
|
15 |
-
temperature: 0
|
16 |
-
max_tokens: 4096
|
17 |
-
|
18 |
-
regex_pattern: \[\[([AB<>=]+)\]\]
|
19 |
-
|
20 |
-
system_prompt: "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user prompt displayed below. You will be given assistant A's answer and assistant B's answer. Your job is to evaluate which assistant's answer is better.\n\nBegin your evaluation by generating your own answer to the prompt. You must provide your answers before judging any answers.\n\nWhen evaluating the assistants' answers, compare both assistants' answers with your answer. You must identify and correct any mistakes or inaccurate information.\n\nThen consider if the assistant's answers are helpful, relevant, and concise. Helpful means the answer correctly responds to the prompt or follows the instructions. Note when user prompt has any ambiguity or more than one interpretation, it is more helpful and appropriate to ask for clarifications or more information from the user than providing an answer based on assumptions. Relevant means all parts of the response closely connect or are appropriate to what is being asked. Concise means the response is clear and not verbose or excessive.\n\nThen consider the creativity and novelty of the assistant's answers when needed. Finally, identify any missing important information in the assistants' answers that would be beneficial to include when responding to the user prompt.\n\nAfter providing your explanation, you must output only one of the following choices as your final verdict with a label:\n\n1. Assistant A is significantly better: [[A>>B]]\n2. Assistant A is slightly better: [[A>B]]\n3. Tie, relatively the same: [[A=B]]\n4. Assistant B is slightly better: [[B>A]]\n5. Assistant B is significantly better: [[B>>A]]\n\nExample output: \"My final verdict is tie: [[A=B]]\"."
|
21 |
-
|
22 |
-
prompt_template: ["<|User Prompt|>\n{question_1}\n\n<|The Start of Assistant A's Answer|>\n{answer_1}\n<|The End of Assistant A's Answer|>\n\n<|The Start of Assistant B's Answer|>\n{answer_2}\n<|The End of Assistant B's Answer|>"]
|
23 |
-
|
24 |
-
# Add your model below for evaluation
|
25 |
-
# model_list:
|
26 |
-
# - gpt-3.5-turbo-0125
|
src/backend/tasks/arena_hard/model_answer/gpt-4-0314.jsonl
DELETED
The diff for this file is too large to render.
See raw diff
|
|
src/backend/tasks/arena_hard/question.jsonl
DELETED
The diff for this file is too large to render.
See raw diff
|
|
src/backend/tasks/arena_hard/task.py
DELETED
@@ -1,220 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from typing import Union, List
|
3 |
-
|
4 |
-
from lm_eval.api.task import ConfigurableTask
|
5 |
-
from lm_eval.api.instance import Instance
|
6 |
-
|
7 |
-
# from lm_eval.api.registry import register_task
|
8 |
-
from lm_eval.api.metrics import mean
|
9 |
-
|
10 |
-
from src.backend.envs import DEVICE
|
11 |
-
|
12 |
-
import pandas as pd
|
13 |
-
|
14 |
-
from src.backend.tasks.measurement_task_utils import measure_system_metrics
|
15 |
-
import json
|
16 |
-
|
17 |
-
from typing import (
|
18 |
-
Any,
|
19 |
-
Dict,
|
20 |
-
List,
|
21 |
-
Optional,
|
22 |
-
Union,
|
23 |
-
)
|
24 |
-
|
25 |
-
from datasets import Dataset
|
26 |
-
import re
|
27 |
-
|
28 |
-
from src.backend.tasks.arena_hard.arena_utils import (
|
29 |
-
load_questions,
|
30 |
-
load_questions,
|
31 |
-
load_model_answers,
|
32 |
-
make_config,
|
33 |
-
)
|
34 |
-
|
35 |
-
from src.backend.tasks.arena_hard.arena_judgment import (
|
36 |
-
judgment,
|
37 |
-
get_battles_from_scores,
|
38 |
-
compute_mle_elo,
|
39 |
-
predict_win_rate,
|
40 |
-
get_win_rate_column
|
41 |
-
)
|
42 |
-
|
43 |
-
def load_questions(question_file: str):
|
44 |
-
"""Load questions from a file."""
|
45 |
-
questions = []
|
46 |
-
with open(question_file, "r") as ques_file:
|
47 |
-
for line in ques_file:
|
48 |
-
if line:
|
49 |
-
questions.append(json.loads(line))
|
50 |
-
return questions
|
51 |
-
|
52 |
-
def download_wrapper(func):
|
53 |
-
def download(self, *args, **kwargs):
|
54 |
-
print("Using Arena Hard, No need to download")
|
55 |
-
return download
|
56 |
-
|
57 |
-
original_download = ConfigurableTask.download
|
58 |
-
ConfigurableTask.download = download_wrapper(original_download)
|
59 |
-
# @register_task("selfcheckgpt")
|
60 |
-
@measure_system_metrics
|
61 |
-
class ArenaHard(ConfigurableTask):
|
62 |
-
VERSION = 0.0
|
63 |
-
OUTPUT_TYPE = "generate_until"
|
64 |
-
data_path = os.path.join(os.path.dirname(__file__), 'question.jsonl')
|
65 |
-
judge_config_path = os.path.join(os.path.dirname(__file__), "configs/judge_config.yaml")
|
66 |
-
configs = make_config(judge_config_path)
|
67 |
-
model_ans_dir = os.path.join(os.path.dirname(__file__), "model_answer")
|
68 |
-
model_answers = load_model_answers(model_ans_dir)
|
69 |
-
data = load_questions(data_path)
|
70 |
-
|
71 |
-
def __init__(self):
|
72 |
-
super().__init__(config={"metadata": {"version": self.VERSION}})
|
73 |
-
# these end tokens are hard coded because of the current limitaion of the llm-eval.
|
74 |
-
# self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
|
75 |
-
self.generation_kwargs = {"until": ["</s>", "<|im_end|>"], "max_gen_toks": 4096}
|
76 |
-
# self.generation_kwargs_sampling_number = 5 # the number of sampling for self-consistence
|
77 |
-
# self.generation_kwargs_sampling = {
|
78 |
-
# "temperature": 0.99,
|
79 |
-
# "do_sample": True,
|
80 |
-
# "until": ["<im_end>", "<im_end>"],
|
81 |
-
# "max_length": 1024,
|
82 |
-
# }
|
83 |
-
|
84 |
-
def transform_data(self, data):
|
85 |
-
transformed_data = []
|
86 |
-
for i in range(len(data)):
|
87 |
-
if self.configs["baseline"]:
|
88 |
-
baseline_answer = self.model_answers[self.configs["baseline_model"]][data[i]["question_id"]]
|
89 |
-
else:
|
90 |
-
baseline_answer = None
|
91 |
-
transformed_item = {
|
92 |
-
"question_id": data[i]["question_id"],
|
93 |
-
"content": data[i]["turns"][0]["content"], # Assuming you want the first turn's content
|
94 |
-
"model_answer": baseline_answer
|
95 |
-
}
|
96 |
-
transformed_data.append(transformed_item)
|
97 |
-
return transformed_data
|
98 |
-
|
99 |
-
def has_training_docs(self):
|
100 |
-
return False
|
101 |
-
|
102 |
-
def has_validation_docs(self):
|
103 |
-
return True
|
104 |
-
|
105 |
-
def has_test_docs(self):
|
106 |
-
return False
|
107 |
-
|
108 |
-
def validation_docs(self):
|
109 |
-
self.dataset = self.transform_data(self.data)
|
110 |
-
self.dataset = Dataset.from_dict({"question_id": [item["question_id"] for item in self.dataset],
|
111 |
-
"content": [item["content"] for item in self.dataset],
|
112 |
-
"model_answer": [item["model_answer"] for item in self.dataset]})
|
113 |
-
return self.dataset
|
114 |
-
|
115 |
-
def doc_to_text(self, doc):
|
116 |
-
sentence = doc["content"]
|
117 |
-
doc_text = f"{sentence}\n"
|
118 |
-
return doc_text
|
119 |
-
|
120 |
-
def doc_to_target(self, doc):
|
121 |
-
q_id = doc["question_id"]
|
122 |
-
return q_id
|
123 |
-
|
124 |
-
def construct_requests(self, doc: dict, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
|
125 |
-
arguments = (ctx, self.generation_kwargs)
|
126 |
-
request_list = [
|
127 |
-
Instance(request_type="generate_until", doc=doc, arguments=arguments, idx=0, **kwargs),
|
128 |
-
]
|
129 |
-
# sampling_arguments = (ctx, self.generation_kwargs_sampling)
|
130 |
-
# request_list.extend(
|
131 |
-
# [
|
132 |
-
# Instance(request_type="generate_until", doc=doc, arguments=sampling_arguments, idx=idx, **kwargs)
|
133 |
-
# for idx in range(1, self.generation_kwargs_sampling_number + 1)
|
134 |
-
# ]
|
135 |
-
# )
|
136 |
-
return request_list
|
137 |
-
|
138 |
-
def process_results(self, doc, results):
|
139 |
-
response_temperature_0 = results[0]
|
140 |
-
# other_responses = results[1:]
|
141 |
-
api_config_path = os.path.join(os.path.dirname(__file__), "configs/api_config.yaml")
|
142 |
-
endpoint_list = make_config(api_config_path)
|
143 |
-
|
144 |
-
if self.configs["regex_pattern"]:
|
145 |
-
pattern = re.compile(self.configs["regex_pattern"])
|
146 |
-
|
147 |
-
ref_answer_dir = os.path.join(os.path.dirname(__file__), "reference_answer")
|
148 |
-
|
149 |
-
ref_answers = None
|
150 |
-
if self.configs["reference"]:
|
151 |
-
ref_answers = load_model_answers(ref_answer_dir)
|
152 |
-
ref_answers = [ref_answers[model] for model in self.configs["ref_model"]]
|
153 |
-
|
154 |
-
# output_files = {}
|
155 |
-
# models = ["custom_model"]
|
156 |
-
# output_dir = f"{os.path.join(os.path.dirname(__file__))}/model_judgments/{self.configs['judge_model']}"
|
157 |
-
# for model in models:
|
158 |
-
# output_files[model] = os.path.join(
|
159 |
-
# output_dir,
|
160 |
-
# f"{model}.jsonl",
|
161 |
-
# )
|
162 |
-
|
163 |
-
# for output_file in output_files.values():
|
164 |
-
# os.makedirs(os.path.dirname(output_file), exist_ok=True)
|
165 |
-
|
166 |
-
endpoint_info = endpoint_list[self.configs["judge_model"]]
|
167 |
-
|
168 |
-
question = doc
|
169 |
-
kwargs = {}
|
170 |
-
kwargs["question"] = question
|
171 |
-
kwargs["answer"] = response_temperature_0
|
172 |
-
if ref_answers:
|
173 |
-
kwargs["reference"] = [ref_answer[doc["question_id"]] for ref_answer in ref_answers]
|
174 |
-
assert len(kwargs["reference"]) == len(self.configs["ref_model"])
|
175 |
-
else:
|
176 |
-
kwargs["reference"] = None
|
177 |
-
|
178 |
-
if self.configs["baseline"]:
|
179 |
-
kwargs["baseline_answer"] = doc["model_answer"]
|
180 |
-
else:
|
181 |
-
kwargs["baseline_answer"] = None
|
182 |
-
kwargs["configs"] = self.configs
|
183 |
-
kwargs["endpoint_dict"] = endpoint_info
|
184 |
-
# kwargs["output_file"] = output_files["custom_model"]
|
185 |
-
kwargs["regex_pattern"] = pattern
|
186 |
-
|
187 |
-
scores = judgment(**kwargs)
|
188 |
-
return {"score": scores}
|
189 |
-
|
190 |
-
def aggregation(self):
|
191 |
-
"""
|
192 |
-
:returns: {str: [float] -> float}
|
193 |
-
A dictionary where keys are the names of submetrics and values are
|
194 |
-
functions that aggregate a list of metrics
|
195 |
-
"""
|
196 |
-
##TODO implement the aggregation function to calculate elo for score
|
197 |
-
def get_win_rate(score_list):
|
198 |
-
battles = get_battles_from_scores(score_list)
|
199 |
-
bootstrap_online_elo = compute_mle_elo(battles)
|
200 |
-
stats = pd.DataFrame()
|
201 |
-
stats["results"] = None
|
202 |
-
stats["results"] = stats['results'].astype('object')
|
203 |
-
for i, model in enumerate(bootstrap_online_elo.index):
|
204 |
-
stats.at[i, "model"] = model
|
205 |
-
stats.at[i, "score"] = bootstrap_online_elo[model]
|
206 |
-
|
207 |
-
stats.sort_values(by="model", inplace=True)
|
208 |
-
stats["score"] = get_win_rate_column(stats, "score", "gpt-4-0314").tolist()
|
209 |
-
|
210 |
-
return stats["score"][1]
|
211 |
-
|
212 |
-
return {k: get_win_rate for k in ["score"]}
|
213 |
-
|
214 |
-
def higher_is_better(self):
|
215 |
-
"""
|
216 |
-
:returns: {str: bool}
|
217 |
-
A dictionary where keys are the names of submetrics and values are
|
218 |
-
whether a higher value of the submetric is better
|
219 |
-
"""
|
220 |
-
return {k: True for k in ["score"]}
|
src/backend/tasks/gsm8k/gsm8k-custom.yaml
DELETED
@@ -1,47 +0,0 @@
-group:
-  - math_word_problems
-task: gsm8k_custom
-dataset_path: gsm8k
-dataset_name: main
-output_type: generate_until
-training_split: train
-fewshot_split: train
-test_split: test
-doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer}}" #" {{answer.split('### ')[-1].rstrip()}}"
-metric_list:
-  - metric: exact_match
-    aggregation: mean
-    higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: false
-    regexes_to_ignore:
-      - ","
-      - "\\$"
-      - "(?s).*#### "
-      - "\\.$"
-generation_kwargs:
-  until:
-    - "Question:"
-    - "Question"
-    - "</s>"
-    - "<|im_end|>"
-  do_sample: false
-  temperature: 0.0
-# is_gsm8k: true
-repeats: 1
-num_fewshot: 5
-filter_list:
-  - name: "strict-match"
-    filter:
-      - function: "regex"
-        regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
-      - function: "take_first"
-  - name: "flexible-extract"
-    filter:
-      - function: "regex"
-        group_select: -1
-        regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
-      - function: "take_first"
-metadata:
-  version: 3.0
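
The two filters in this deleted config extract the final numeric answer from free-form generations: "strict-match" requires the canonical "#### <number>" marker, while "flexible-extract" falls back to the last number-like token. A small standalone illustration of how those regexes behave (plain Python, not the lm-eval filter machinery itself):

import re

strict = re.compile(r"#### (\-?[0-9\.\,]+)")
flexible = re.compile(r"(-?[$0-9.,]{2,})|(-?[0-9]+)")

output = "Natalia sold 48 clips in April and 24 in May, so 72 in total.\n#### 72"
print(strict.search(output).group(1))                        # '72'  (strict-match)
print([m.group(0) for m in flexible.finditer(output)][-1])   # '72'  (flexible-extract keeps the last match)
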
src/backend/tasks/measurement_task_utils.py
CHANGED
@@ -12,9 +12,6 @@ def process_results_decorator(func):
         end_to_end_time = sum([r[1] for r in results]) / len(results)
         prefilling_time = sum([r[2] for r in results]) / len(results)
         decoding_throughput = sum([r[3] for r in results]) / len(results)
-        mfu = sum([r[4] for r in results]) / len(results)
-        mbu = sum([r[5] for r in results]) / len(results)
-
         # print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")

         # Now call the original process_results with the processed results
@@ -22,8 +19,6 @@ def process_results_decorator(func):
         result_dict["end_to_end_time"] = end_to_end_time
         result_dict["prefilling_time"] = prefilling_time
         result_dict["decoding_throughput"] = decoding_throughput
-        result_dict["mfu"] = mfu
-        result_dict["mbu"] = mbu
         return result_dict
     return wrapper

@@ -35,8 +30,6 @@ def aggregation_decorator(func):
         aggregation_list["end_to_end_time"] = mean
         aggregation_list["prefilling_time"] = mean
         aggregation_list["decoding_throughput"] = mean
-        aggregation_list["mfu"] = mean
-        aggregation_list["mbu"] = mean
         return aggregation_list
     return wrapper

@@ -48,8 +41,6 @@ def higher_is_better_decorator(func):
         higher_is_better_dict["end_to_end_time"] = False
         higher_is_better_dict["prefilling_time"] = False
         higher_is_better_dict["decoding_throughput"] = True
-        higher_is_better_dict["mfu"] = True
-        higher_is_better_dict["mbu"] = True
         return higher_is_better_dict
     return wrapper
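
All three decorators follow the same pattern: each measured result arrives as a tuple whose first element is the original task result and whose remaining elements are timing metrics; the wrapper averages the metrics, forwards only the original results to the wrapped lm-eval hook, then attaches the averages to its return value. A minimal self-contained sketch of that pattern (the exact call signature into the wrapped method is assumed here; the real decorators operate on lm-eval ConfigurableTask methods):

import functools

def process_results_decorator(func):
    @functools.wraps(func)
    def wrapper(self, doc, results, *args, **kwargs):
        # Each result is (original_result, end_to_end_time, prefilling_time, decoding_throughput).
        end_to_end_time = sum(r[1] for r in results) / len(results)
        prefilling_time = sum(r[2] for r in results) / len(results)
        decoding_throughput = sum(r[3] for r in results) / len(results)
        # Hand only the original results to the wrapped process_results.
        result_dict = func(self, doc, [r[0] for r in results], *args, **kwargs)
        result_dict["end_to_end_time"] = end_to_end_time
        result_dict["prefilling_time"] = prefilling_time
        result_dict["decoding_throughput"] = decoding_throughput
        return result_dict
    return wrapper
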
src/backend/tasks/selfcheckgpt/task.py
CHANGED
@@ -27,12 +27,12 @@ class SelfCheckGPT(ConfigurableTask):
         super().__init__(config={"metadata": {"version": self.VERSION}})
         # these end tokens are hard coded because of the current limitaion of the llm-eval.
         # self.generation_kwargs = {"until": ["\n\n", "<unk>", "<|im_end|>", "</s>", "<|endoftext|>"], "max_length": 512}
-        self.generation_kwargs = {"until": [
+        self.generation_kwargs = {"until": ["<im_end>"], "max_length": 1024}
         self.generation_kwargs_sampling_number = 5  # the number of sampling for self-consistence
         self.generation_kwargs_sampling = {
             "temperature": 0.99,
             "do_sample": True,
-            "until": [
+            "until": ["<im_end>", "</s>"],
             "max_length": 1024,
         }
src/display/about.py
CHANGED
@@ -3,36 +3,23 @@ from src.display.utils import ModelType
 TITLE = """<h1 align="center" id="space-title">OPEN-MOE-LLM-LEADERBOARD</h1>"""

 INTRODUCTION_TEXT = """
-The OPEN-MOE-LLM-LEADERBOARD is specifically designed to assess the performance and efficiency of various Mixture of Experts (MoE) Large Language Models (LLMs).
-This initiative, driven by the open-source community, aims to comprehensively evaluate these advanced MoE LLMs.
+The OPEN-MOE-LLM-LEADERBOARD is specifically designed to assess the performance and efficiency of various Mixture of Experts (MoE) Large Language Models (LLMs). This initiative, driven by the open-source community, aims to comprehensively evaluate these advanced MoE LLMs. We extend our gratitude to the Huggingface for the GPU community grant that supported the initial debugging process, and to [NetMind.AI](https://netmind.ai/home) for their generous GPU donation, which ensures the continuous operation of the Leaderboard.

 The OPEN-MOE-LLM-LEADERBOARD includes generation and multiple choice tasks to measure the performance and efficiency of MOE LLMs.


 Tasks:
+- **Generation Self-consistancy** -- [SelfCheckGPT](https://github.com/potsawee/selfcheckgpt)
 - **Multiple Choice Performance** -- [MMLU](https://arxiv.org/abs/2009.03300)
-- **Mathematics Problem-Solving Performance** -- [GSM8K](https://arxiv.org/abs/2110.14168)
-- **AI Judgment Scores for Responses to Complex User Queries** -- [Arena_Hard](https://lmsys.org/blog/2024-04-19-arena-hard/)

 Columns and Metrics:
 - Method: The MOE LLMs inference framework.
 - E2E(s): Average End to End generation time in seconds.
 - PRE(s): Prefilling Time of input prompt in seconds.
 - T/s: Tokens throughout per second.
-- S-MBU(%): Sparse Model Bandwidth Utilization.
-- S-MFU(%): Sparse Model FLOPs Utilization.
 - Precision: The precison of used model.

 """
-
-ACKNOWLEDGEMENT_TEXT = """
-<div>
-    <h4>Acknowledgements</h4>
-    {image_html}
-    <p>We express our sincere gratitude to <a href="https://netmind.ai/home">NetMind.AI</a> for their generous donation of GPUs, which plays a crucial role in ensuring the continuous operation of our Leaderboard.</p>
-</div>
-"""
-
 LLM_BENCHMARKS_TEXT = f"""

 """
src/display/imgs/Netmind.AI_LOGO.jpg
DELETED
Binary file (6.92 kB)
src/display/utils.py
CHANGED
@@ -13,33 +13,6 @@ TS = "T/s" #Decoding throughput (tok/s)
 InFrame = "Method" #"Inference framework"
 MULTIPLE_CHOICEs = ["mmlu"]

-GPU_TEMP = 'Temp(C)'
-GPU_Power = 'Power(W)'
-GPU_Mem = 'Mem(G)'
-GPU_Name = "GPU"
-GPU_Util = 'Util(%)'
-MFU = 'S-MFU(%)'
-MBU = 'S-MBU(%)'
-BATCH_SIZE = 'bs'
-PRECISION = "Precision"
-system_metrics_to_name_map = {
-    "end_to_end_time": f"{E2Es}",
-    "prefilling_time": f"{PREs}",
-    "decoding_throughput": f"{TS}",
-    "mfu": f"{MFU}",
-    "mbu": f"{MBU}"
-}
-
-gpu_metrics_to_name_map = {
-    GPU_Util: GPU_Util,
-    GPU_TEMP: GPU_TEMP,
-    GPU_Power: GPU_Power,
-    GPU_Mem: GPU_Mem,
-    "batch_size": BATCH_SIZE,
-    "precision": PRECISION,
-    GPU_Name: GPU_Name
-}
-
 @dataclass
 class Task:
     benchmark: str
@@ -77,11 +50,8 @@ class Tasks(Enum):
     # halueval_dial = Task("halueval_dialogue", "acc", "HaluDial/Acc")

     # # XXX include me back at some point
-
+    selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
     mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)
-    gsm8k = Task("gsm8k_custom", "em", "GSM8K") #GSM8K/EM (5-shot)
-    # gsm8k_cot = Task("gsm8k_cot", "em", "GSM8K COT") #GSM8K COT/EM (5-shot)
-    arena_hard = Task("arena_hard", "score", "Arena Hard") #Arena Hard/Score


 # These classes are for user facing column names,
@@ -106,35 +76,27 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 # # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])

 # Inference framework
-auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True
+auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True)])

 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
     # System performance metrics
-    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name}
-    auto_eval_column_dict.append([f"{task.name}_batch_size", ColumnContent, ColumnContent(f"{task.value.col_name} {BATCH_SIZE}", "number", True, hidden=True)])
-    # auto_eval_column_dict.append([f"{task.name}_precision", ColumnContent, ColumnContent(f"{task.value.col_name} {PRECISION}", "str", True, hidden=True)])
-    # auto_eval_column_dict.append([f"{task.name}_gpu_mem", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Mem}", "number", True, hidden=True)])
-    auto_eval_column_dict.append([f"{task.name}_gpu", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Name}", "str", True, hidden=True)])
-    # auto_eval_column_dict.append([f"{task.name}_gpu_util", ColumnContent, ColumnContent(f"{task.value.col_name} {GPU_Util}", "number", True, hidden=True)])
+    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name}-{E2Es}", "number", True)])
     if task.value.benchmark in MULTIPLE_CHOICEs:
         continue
-
-    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name}
-    auto_eval_column_dict.append([f"{task.name}_mbu", ColumnContent, ColumnContent(f"{task.value.col_name} {MBU}", "number", True, hidden=True)])
-    auto_eval_column_dict.append([f"{task.name}_mfu", ColumnContent, ColumnContent(f"{task.value.col_name} {MFU}", "number", True, hidden=True)])
-
+    auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name}-{PREs}", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name}-{TS}", "number", True)])

 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
@@ -160,10 +122,10 @@ class ModelDetails:


 class ModelType(Enum):
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    FT = ModelDetails(name="fine-tuned on domain-specific datasets", symbol="🔶")
     chat = ModelDetails(name="chat models (RLHF, DPO, IFT, ...)", symbol="💬")
+    merges = ModelDetails(name="base merges and moerges", symbol="🤝")
     Unknown = ModelDetails(name="", symbol="?")

     def to_str(self, separator=" "):
@@ -171,24 +133,21 @@ class ModelType(Enum):

     @staticmethod
     def from_str(type):
+        if "fine-tuned" in type or "🔶" in type:
+            return ModelType.FT
+        if "pretrained" in type or "🟢" in type:
+            return ModelType.PT
         if any([k in type for k in ["instruction-tuned", "RL-tuned", "chat", "🟦", "⭕", "💬"]]):
             return ModelType.chat
+        if "merge" in type or "🤝" in type:
+            return ModelType.merges
         return ModelType.Unknown


 class InferenceFramework(Enum):
     # "moe-infinity", hf-chat
+    MoE_Infinity = ModelDetails("moe-infinity")
     HF_Chat = ModelDetails("hf-chat")
-    VLLM = ModelDetails("vllm_moe")
-    TRTLLM = ModelDetails("tensorrt_llm")
-    VLLM_FIX = ModelDetails("vllm_moe_fixbs")
     Unknown = ModelDetails("?")

     def to_str(self):
@@ -196,34 +155,13 @@ class InferenceFramework(Enum):

     @staticmethod
     def from_str(inference_framework: str):
-        if inference_framework in ["tensorrt_llm"]:
-            return InferenceFramework.TRTLLM
+        if inference_framework in ["moe-infinity"]:
+            return InferenceFramework.MoE_Infinity
         if inference_framework in ["hf-chat"]:
             return InferenceFramework.HF_Chat
-        if inference_framework in ["vllm_moe"]:
-            return InferenceFramework.VLLM
-        if inference_framework in ["vllm_moe_fixbs"]:
-            return InferenceFramework.VLLM_FIX
         return InferenceFramework.Unknown

-class GPUType(Enum):
-    A100_sxm = ModelDetails("NVIDIA-A100-SXM4-80GB")
-    A100_pcie = ModelDetails("NVIDIA-A100-PCIe-80GB")
-    Unknown = ModelDetails("?")
-
-    def to_str(self):
-        return self.value.name
-
-    @staticmethod
-    def from_str(gpu_type: str):
-        if gpu_type in ["NVIDIA-A100-PCIe-80GB"]:
-            return GPUType.A100_pcie
-        if gpu_type in ["NVIDIA-A100-SXM4-80GB"]:
-            return GPUType.A100_sxm
-        return GPUType.Unknown
-
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
@@ -231,34 +169,34 @@ class WeightType(Enum):


 class Precision(Enum):
+    float32 = ModelDetails("float32")
+    float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     qt_8bit = ModelDetails("8bit")
     qt_4bit = ModelDetails("4bit")
+    qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")

     @staticmethod
     def from_str(precision: str):
+        if precision in ["torch.float32", "float32"]:
+            return Precision.float32
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
         if precision in ["8bit"]:
             return Precision.qt_8bit
         if precision in ["4bit"]:
             return Precision.qt_4bit
+        if precision in ["GPTQ", "None"]:
+            return Precision.qt_GPTQ
         return Precision.Unknown


 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn)]
-TYPES = [c.type for c in fields(AutoEvalColumn)]
+COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
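
auto_eval_column_dict collects [attribute_name, type, ColumnContent(...)] triples, and the leaderboard then materialises them as attributes of an AutoEvalColumn class, which is what the COLS/TYPES comprehensions above iterate over. A plausible sketch of that wiring, assuming the usual Hugging Face leaderboard pattern built on dataclasses.make_dataclass and a custom fields() helper; the exact helpers in this repo are not shown in the diff:

from dataclasses import dataclass, field, make_dataclass

@dataclass
class ColumnContent:
    name: str                 # column header shown in the UI
    type: str                 # "str", "number", "bool", ...
    displayed_by_default: bool = False
    hidden: bool = False
    dummy: bool = False

auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True)],
    ["precision", ColumnContent, ColumnContent("Precision", "str", True)],
]

# Build a frozen class whose attributes default to the ColumnContent instances above.
AutoEvalColumn = make_dataclass(
    "AutoEvalColumn",
    [(name, typ, field(default_factory=lambda c=col: c)) for name, typ, col in auto_eval_column_dict],
    frozen=True,
)

def fields(cls):
    # Assumed helper: yield the ColumnContent defaults rather than dataclasses.Field objects.
    return [f.default_factory() for f in cls.__dataclass_fields__.values()]

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]  # ['Model', 'Precision']
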
src/leaderboard/read_evals.py
CHANGED
@@ -65,11 +65,11 @@ class EvalResult:
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
-            result_key = f"{model}_{precision.value.name}
+            result_key = f"{model}_{precision.value.name}"
         else:
             org = org_and_model[0]
             model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision.value.name}
+            result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)

         still_on_hub, error, model_config = is_model_on_hub(
@@ -103,13 +103,6 @@ class EvalResult:

             if to_add is True:
                 multiplier = 100.0
-                if "GPU" in metric:
-                    results[benchmark][metric] = value
-                    continue
-                if "precision" in metric:
-                    results[benchmark][metric] = value
-                    continue
-
                 if "rouge" in metric and "truthful" not in benchmark:
                     multiplier = 1.0
                 if "squad" in benchmark:
@@ -118,17 +111,9 @@ class EvalResult:
                     multiplier = 1.0
                 if "throughput" in metric:
                     multiplier = 1.0
-                if "batch_" in metric or "Mem" in metric or "Util" in metric:
-                    multiplier = 1
-
                 # print('RESULTS', data['results'])
                 # print('XXX', benchmark, metric, value, multiplier)
-                    results[benchmark][metric] = "-"
-                elif value == "auto":
-                    results[benchmark][metric] = "auto"
-                else:
-                    results[benchmark][metric] = value * multiplier
+                results[benchmark][metric] = value * multiplier

         res = EvalResult(
             eval_name=result_key,
@@ -140,7 +125,6 @@ class EvalResult:
             revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture,
-            model_type=ModelType.from_str(config.get("model_type", "")),
             inference_framework=inference_framework,
         )
@@ -175,22 +159,22 @@ class EvalResult:

         # breakpoint()
         # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)

         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
+            AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
+            AutoEvalColumn.revision.name: self.revision,
+            # AutoEvalColumn.average.name: average,
+            AutoEvalColumn.license.name: self.license,
+            AutoEvalColumn.likes.name: self.likes,
+            AutoEvalColumn.params.name: self.num_params,
+            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
             AutoEvalColumn.inference_framework.name: self.inference_framework,
         }
@@ -278,22 +262,15 @@ def get_raw_eval_results(results_path: str, requests_path: str, is_backend: bool
     eval_results = {}
     for model_result_filepath in tqdm(model_result_filepaths, desc="reading model_result_filepaths"):
-            eval_name
-            else:
-                eval_results[eval_name] = eval_result
-        except (FileNotFoundError, ValueError, KeyError, json.JSONDecodeError) as e:
-            # Log the error and continue with the next file
-            print(f"Error processing file {model_result_filepath}: {e}")
-            continue
+        # Creation of result
+        eval_result = EvalResult.init_from_json_file(model_result_filepath, is_backend=is_backend)
+        eval_result.update_with_request_file(requests_path)
+        # Store results of same eval together
+        eval_name = eval_result.eval_name
+        if eval_name in eval_results.keys():
+            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+        else:
+            eval_results[eval_name] = eval_result

     results = []
     for v in eval_results.values():
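
After this change every metric goes through a single normalisation path: accuracy-style fractions are scaled to percentages, while rouge, squad and throughput metrics keep their raw units. A simplified stand-in for that rule in isolation (not the full init_from_json_file method):

def normalize_metric(benchmark: str, metric: str, value: float) -> float:
    multiplier = 100.0  # default: fractions displayed as percentages
    if "rouge" in metric and "truthful" not in benchmark:
        multiplier = 1.0
    if "squad" in benchmark:
        multiplier = 1.0
    if "throughput" in metric:
        multiplier = 1.0
    return value * multiplier

print(normalize_metric("mmlu", "acc", 0.613))                 # 61.3
print(normalize_metric("mmlu", "decoding_throughput", 42.0))  # 42.0
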
src/populate.py
CHANGED
@@ -12,7 +12,7 @@ from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, update_
 from src.backend.envs import Tasks as BackendTasks
 from src.display.utils import Tasks
-from src.display.utils import
+from src.display.utils import E2Es, PREs, TS

 def get_leaderboard_df(
     results_path: str,
@@ -45,7 +45,12 @@ def get_leaderboard_df(
         bm = (task.benchmark, task.metric)
         name_to_bm_map[name] = bm

+    # bm_to_name_map = {bm: name for name, bm in name_to_bm_map.items()}
+    system_metrics_to_name_map = {
+        "end_to_end_time": f"{E2Es}",
+        "prefilling_time": f"{PREs}",
+        "decoding_throughput": f"{TS}",
+    }

     all_data_json = []
     for entry in all_data_json_:
@@ -58,9 +63,6 @@ def get_leaderboard_df(
                 if sys_metric in entry[k]:
                     new_entry[f"{k} {metric_namne}"] = entry[k][sys_metric]

-            for gpu_metric, metric_namne in gpu_metrics_to_name_map.items():
-                if gpu_metric in entry[k]:
-                    new_entry[f"{k} {metric_namne}"] = entry[k][gpu_metric]
         all_data_json += [new_entry]

     # all_data_json.append(baseline_row)
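
The loop above copies each task's nested system metrics into flat, display-ready columns such as "MMLU E2E(s)". A small standalone illustration of that flattening step (entry values invented for the example):

system_metrics_to_name_map = {
    "end_to_end_time": "E2E(s)",
    "prefilling_time": "PRE(s)",
    "decoding_throughput": "T/s",
}

entry = {"MMLU": {"acc": 61.3, "end_to_end_time": 2.4, "decoding_throughput": 38.1}}

new_entry = dict(entry)
for task_name, task_results in entry.items():
    for sys_metric, display_name in system_metrics_to_name_map.items():
        if sys_metric in task_results:
            new_entry[f"{task_name} {display_name}"] = task_results[sys_metric]

print(new_entry["MMLU E2E(s)"])  # 2.4
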
src/submission/check_validity.py
CHANGED
@@ -74,7 +74,7 @@ def is_model_on_hub(

 def get_model_size(model_info: ModelInfo, precision: str):
-    size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
+    size_pattern = size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
     try:
         model_size = round(model_info.safetensors["total"] / 1e9, 3)
     except (AttributeError, TypeError):
@@ -130,8 +130,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
             continue
         with open(os.path.join(root, file), "r") as f:
             info = json.load(f)
-
-            file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}_{info['gpu_type']}")
+            file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}_{info['inference_framework']}")

         # Select organisation
         if info["model"].count("/") == 0 or "submitted_time" not in info:
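
When safetensors metadata is unavailable, the size regex is presumably used to recover the parameter count from the model id itself. Here is how that pattern behaves on its own, as a standalone illustration rather than the repo's exact fallback code:

import re

size_pattern = re.compile(r"(\d\.)?\d+(b|m)")

for model_id in ["mistral-7b-instruct", "phi-1.5b", "gpt2-350m"]:
    match = size_pattern.search(model_id.lower())
    print(model_id, "->", match.group(0) if match else None)
# mistral-7b-instruct -> 7b
# phi-1.5b -> 1.5b
# gpt2-350m -> 350m
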
src/submission/submit.py
CHANGED
@@ -26,8 +26,7 @@ def add_new_eval(
     weight_type: str,
     model_type: str,
     inference_framework: str,
-    debug: bool = False
-    gpu_type: str = "NVIDIA-A100-PCIe-80GB",
+    debug: bool = False
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -115,18 +114,17 @@ def add_new_eval(
         "params": model_size,
         "license": license,
         "inference_framework": inference_framework,
-        "gpu_type": gpu_type
     }

     # Check for duplicate submission
-    if f"{model}_{revision}_{precision}_{inference_framework}
+    if f"{model}_{revision}_{precision}_{inference_framework}" in REQUESTED_MODELS:
         return styled_warning("This model has been already submitted.")

     print("Creating eval file")
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
     # out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}_{inference_framework}.json"

     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
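
Dropping gpu_type from both the duplicate check and the request filename means a submission is now identified purely by model, revision, precision and inference framework. A tiny illustration of the resulting key and file name (all concrete values invented for the example):

model, revision, precision = "mistralai/Mixtral-8x7B-Instruct-v0.1", "main", "bfloat16"
inference_framework = "hf-chat"

dedup_key = f"{model}_{revision}_{precision}_{inference_framework}"
out_name = f"{model.split('/')[-1]}_eval_request_False_{precision}_Original_{inference_framework}.json"

print(dedup_key)  # mistralai/Mixtral-8x7B-Instruct-v0.1_main_bfloat16_hf-chat
print(out_name)   # Mixtral-8x7B-Instruct-v0.1_eval_request_False_bfloat16_Original_hf-chat.json
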
src/utils.py
CHANGED
@@ -1,56 +1,6 @@
 import pandas as pd
 from huggingface_hub import snapshot_download
-import subprocess
-import re
-import os
-import GPUtil

-try:
-    from src.display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
-except:
-    print("local debug: from display.utils")
-    from display.utils import GPU_TEMP, GPU_Mem, GPU_Power, GPU_Util, GPU_Name
-
-MEM_BW_DICT ={
-    "NVIDIA-A100-PCIe-80GB": 1935,
-    "NVIDIA-A100-SXM-80GB": 2039,
-    "NVIDIA-H100-PCIe-80GB": 2039,
-    "NVIDIA-RTX-A5000-24GB": 768
-}
-
-PEAK_FLOPS_DICT = {
-    "float32":{
-        "NVIDIA-A100-PCIe-80GB": 312e12,
-        "NVIDIA-A100-SXM-80GB": 312e12,
-        "NVIDIA-H100-PCIe-80GB": 756e12,
-        "NVIDIA-RTX-A5000-24GB": 222.2e12
-    },
-    "float16":{
-        "NVIDIA-A100-PCIe-80GB": 624e12,
-        "NVIDIA-A100-SXM-80GB": 624e12,
-        "NVIDIA-H100-PCIe-80GB": 1513e12,
-        "NVIDIA-RTX-A5000-24GB": 444.4e12
-    },
-    "bfloat16":{
-        "NVIDIA-A100-PCIe-80GB": 624e12,
-        "NVIDIA-A100-SXM-80GB": 624e12,
-        "NVIDIA-H100-PCIe-80GB": 1513e12,
-        "NVIDIA-RTX-A5000-24GB": 444.4e12
-    },
-    "8bit":{
-        "NVIDIA-A100-PCIe-80GB": 1248e12,
-        "NVIDIA-A100-SXM-80GB": 1248e12,
-        "NVIDIA-H100-PCIe-80GB": 3026e12,
-        "NVIDIA-RTX-A5000-24GB": 889e12
-    },
-    "4bit": {
-        "NVIDIA-A100-PCIe-80GB": 2496e12,
-        "NVIDIA-A100-SXM-80GB": 2496e12,
-        "NVIDIA-H100-PCIe-80GB": 6052e12,
-        "NVIDIA-RTX-A5000-24GB": 1778e12
-    }
-
-}

 def my_snapshot_download(repo_id, revision, local_dir, repo_type, max_workers):
     for i in range(10):
@@ -82,130 +32,3 @@ def get_dataset_summary_table(file_path):
     df = df[["Category", "Benchmark", "Data Split", "Data Size", "Language"]]

     return df
-
-def parse_nvidia_smi():
-    visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', None)
-    if visible_devices is not None:
-        gpu_indices = visible_devices.split(',')
-    else:
-        # Query all GPU indices if CUDA_VISIBLE_DEVICES is not set
-        result = subprocess.run(['nvidia-smi', '--query-gpu=index', '--format=csv,noheader'], capture_output=True, text=True)
-        if result.returncode != 0:
-            print("Failed to query GPU indices.")
-            return []
-        gpu_indices = result.stdout.strip().split('\n')
-    # print(f"gpu_indices: {gpu_indices}")
-    gpu_stats = []
-
-    gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
-    # gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
-    gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
-
-    gpu_name = ""
-    for index in gpu_indices:
-        result = subprocess.run(['nvidia-smi', '-i', index], capture_output=True, text=True)
-        output = result.stdout.strip()
-        lines = output.split("\n")
-        for line in lines:
-            match = gpu_info_pattern.search(line)
-            name_match = gpu_name_pattern.search(line)
-            gpu_info = {}
-            if name_match:
-                gpu_name = ''.join(filter(None, name_match.groups())).strip()
-            if match:
-                temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
-                gpu_info.update({
-                    GPU_TEMP: temp,
-                    GPU_Power: power_usage,
-                    GPU_Mem: round(mem_usage / 1024, 2),
-                    GPU_Util: gpu_util
-                })
-
-            if len(gpu_info) >= 4:
-                gpu_stats.append(gpu_info)
-    # print(f"gpu_stats: {gpu_stats}")
-    gpu_name = f"{len(gpu_stats)}x{gpu_name}"
-    gpu_stats_total = {
-        GPU_TEMP: 0,
-        GPU_Power: 0,
-        GPU_Mem: 0,
-        GPU_Util: 0,
-        GPU_Name: gpu_name
-    }
-    for gpu_stat in gpu_stats:
-        gpu_stats_total[GPU_TEMP] += gpu_stat[GPU_TEMP]
-        gpu_stats_total[GPU_Power] += gpu_stat[GPU_Power]
-        gpu_stats_total[GPU_Mem] += gpu_stat[GPU_Mem]
-        gpu_stats_total[GPU_Util] += gpu_stat[GPU_Util]
-    gpu_stats_total[GPU_Mem] = gpu_stats_total[GPU_Mem]  # G
-    gpu_stats_total[GPU_TEMP] /= len(gpu_stats)
-    gpu_stats_total[GPU_Power] /= len(gpu_stats)
-    gpu_stats_total[GPU_Util] /= len(gpu_stats)
-    return [gpu_stats_total]
-
-def monitor_gpus(stop_event, interval, stats_list):
-    while not stop_event.is_set():
-        gpu_stats = parse_nvidia_smi()
-        if gpu_stats:
-            stats_list.extend(gpu_stats)
-        stop_event.wait(interval)
-
-def analyze_gpu_stats(stats_list):
-    # Check if the stats_list is empty, and return None if it is
-    if not stats_list:
-        return None
-
-    # Initialize dictionaries to store the stats
-    avg_stats = {}
-    max_stats = {}
-
-    # Calculate average stats, excluding 'GPU_Mem'
-    for key in stats_list[0].keys():
-        if key != GPU_Mem and key != GPU_Name:
-            total = sum(d[key] for d in stats_list)
-            avg_stats[key] = total / len(stats_list)
-
-    # Calculate max stats for 'GPU_Mem'
-    max_stats[GPU_Mem] = max(d[GPU_Mem] for d in stats_list)
-    if GPU_Name in stats_list[0]:
-        avg_stats[GPU_Name] = stats_list[0][GPU_Name]
-    # Update average stats with max GPU memory usage
-    avg_stats.update(max_stats)
-
-    return avg_stats
-
-def get_gpu_details():
-    gpus = GPUtil.getGPUs()
-    gpu = gpus[0]
-    name = gpu.name.replace(" ", "-")
-    memory_gb = round(gpu.memoryTotal / 1024)
-    memory = f"{memory_gb}GB"
-
-    for part in name.split('-'):
-        if part.endswith("GB") and part[:-2].isdigit():
-            name = name.replace(f"-{part}", "").replace(part, "")
-
-    formatted_name = f"{name}-{memory}"
-
-    return formatted_name
-
-def get_peak_bw(gpu_name):
-    return MEM_BW_DICT[gpu_name]
-
-def get_peak_flops(gpu_name, precision):
-    return PEAK_FLOPS_DICT[precision][gpu_name]
-
-def transfer_precision2bytes(precision):
-    if precision == "float32":
-        return 4
-    elif precision in ["float16", "bfloat16"]:
-        return 2
-    elif precision == "8bit":
-        return 1
-    elif precision == "4bit":
-        return 0.5
-    else:
-        raise ValueError(f"Unsupported precision: {precision}")
-
-if __name__ == "__main__":
-    print(analyze_gpu_stats(parse_nvidia_smi()))
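
The deleted helpers existed to turn measured decoding throughput into utilization numbers: peak memory bandwidth in GB/s and peak FLOPS per GPU and precision, plus bytes per parameter for the active precision. A rough sketch of how such pieces could combine into a sparse-model bandwidth utilization (S-MBU) estimate; the formula and the active-parameter figure are illustrative assumptions, not the leaderboard's exact computation:

MEM_BW_GBPS = {"NVIDIA-A100-PCIe-80GB": 1935}  # taken from the deleted MEM_BW_DICT

def precision_bytes(precision):
    return {"float32": 4, "float16": 2, "bfloat16": 2, "8bit": 1, "4bit": 0.5}[precision]

def estimate_smbu(active_params, precision, tokens_per_second, gpu_name):
    # Each decoded token has to stream the active experts' weights once,
    # so achieved bandwidth is roughly active bytes * tokens/s.
    achieved_gbps = active_params * precision_bytes(precision) * tokens_per_second / 1e9
    return achieved_gbps / MEM_BW_GBPS[gpu_name]

# e.g. a Mixtral-style MoE with ~13B active parameters decoding at 40 tok/s in bfloat16:
print(f"S-MBU ~ {estimate_smbu(13e9, 'bfloat16', 40, 'NVIDIA-A100-PCIe-80GB'):.1%}")  # ~53.7%
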