Spaces:

PKU-Alignment
/

ProgressGym-LeaderBoard

Runtime error

App Files Files Community

Tianyi (Alex) Qiu commited on Jul 2, 2024

Commit

24a3e20

1 Parent(s): 487d79e

customize tasks

Browse files

Files changed (10) hide show

.gitignore +5 -0
app.py +5 -19
src/about.py +5 -2
src/display/utils.py +4 -1
src/envs.py +2 -0
src/leaderboard/read_evals.py +9 -2
src/legacy/app.py +0 -331
src/legacy/submit.py +0 -119
src/populate.py +3 -0
src/submission/submit.py +3 -2

.gitignore CHANGED Viewed

@@ -11,3 +11,8 @@ eval-results/
 eval-queue-bk/
 eval-results-bk/
 logs/

 eval-queue-bk/
 eval-results-bk/
 logs/
+demo-leaderboard/
+results/
+upload_history/
+master_table.json

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ from src.display.utils import (
     WeightType,
     Precision
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, DATA_REPO, REPO_ID, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
@@ -35,31 +35,17 @@ def restart_space():
     API.restart_space(repo_id=REPO_ID)
 try:
-    print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=DATA_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 except Exception:
     restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=DATA_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
@@ -75,7 +61,7 @@ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [
-        AutoEvalColumn.model_type_symbol.name,
         AutoEvalColumn.model.name,
     ]
     # We use COLS to maintain sorting

     WeightType,
     Precision
 )
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, DATA_REPO, REPO_ID, TOKEN, REQUESTS_REPO_PATH, RESULTS_REPO_PATH, CACHE_PATH
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
     API.restart_space(repo_id=REPO_ID)
 try:
+    print(CACHE_PATH)
     snapshot_download(
+        repo_id=DATA_REPO, local_dir=CACHE_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
 except Exception:
+    print("Could not download the dataset. Please check your token and network connection.")
     restart_space()
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
 def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     always_here_cols = [
+        # AutoEvalColumn.model_type_symbol.name,
         AutoEvalColumn.model.name,
     ]
     # We use COLS to maintain sorting

src/about.py CHANGED Viewed

@@ -12,8 +12,11 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------

 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    # task0 = Task("anli_r1", "acc", "ANLI")
+    # task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("Follow", "accuracy", "Follow")
+    task1 = Task("Predict", "accuracy", "Predict")
+    task2 = Task("Coevolve", "accuracy", "Coevolve")
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------

src/display/utils.py CHANGED Viewed

@@ -22,13 +22,16 @@ class ColumnContent:
 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])

 ## Leaderboard columns
 auto_eval_column_dict = []
 # Init
+# auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 #Scores
 auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])

src/envs.py CHANGED Viewed

@@ -11,6 +11,8 @@ OWNER = "PKU-Alignment" # Change to your org - don't forget to create a results
 REPO_ID = f"{OWNER}/ProgressGym-LeaderBoard"
 DATA_REPO = f"{OWNER}/ProgressGym-LeaderBoardData"
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")

 REPO_ID = f"{OWNER}/ProgressGym-LeaderBoard"
 DATA_REPO = f"{OWNER}/ProgressGym-LeaderBoardData"
+RESULTS_REPO_PATH = 'eval-results/'
+REQUESTS_REPO_PATH = 'eval-queue/'
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")

src/leaderboard/read_evals.py CHANGED Viewed

@@ -114,7 +114,7 @@ class EvalResult:
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
-            AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
@@ -127,7 +127,10 @@ class EvalResult:
         }
         for task in Tasks:
-            data_dict[task.value.col_name] = self.results[task.value.benchmark]
         return data_dict
@@ -158,6 +161,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
@@ -172,11 +176,13 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
         # Store results of same eval together
         eval_name = eval_result.eval_name
@@ -191,6 +197,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
             v.to_dict() # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
             continue
     return results

             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
+            # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
         }
         for task in Tasks:
+            try:
+                data_dict[task.value.col_name] = self.results[task.value.benchmark]
+            except:
+                data_dict[task.value.col_name] = 0
         return data_dict
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
+    print(f"Reading results from {results_path}")
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
+    print(f"Found these files: {model_result_filepaths}")
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
+        print(f"Found result for {eval_result.full_model} with precision {eval_result.precision.value.name}")
         # Store results of same eval together
         eval_name = eval_result.eval_name
             v.to_dict() # we test if the dict version is complete
             results.append(v)
         except KeyError:  # not all eval values present
+            print(f"Skipping {v.full_model} as not all values are present ({v.to_dict()})")
             continue
     return results

src/legacy/app.py DELETED Viewed

@@ -1,331 +0,0 @@
-import subprocess
-import gradio as gr
-import pandas as pd
-from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
-from src.about import (
-    CITATION_BUTTON_LABEL,
-    CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
-    INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
-    TITLE,
-)
-from src.display.css_html_js import custom_css
-from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
-    NUMERIC_INTERVALS,
-    TYPES,
-    AutoEvalColumn,
-    ModelType,
-    fields,
-    WeightType,
-    Precision
-)
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
-def restart_space():
-    API.restart_space(repo_id=REPO_ID)
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-try:
-    print(EVAL_RESULTS_PATH)
-    snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
-raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-leaderboard_df = original_df.copy()
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
-# Searching and filtering
-def update_table(
-    hidden_df: pd.DataFrame,
-    columns: list,
-    type_query: list,
-    precision_query: str,
-    size_query: list,
-    show_deleted: bool,
-    query: str,
-):
-    filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
-    filtered_df = filter_queries(query, filtered_df)
-    df = select_columns(filtered_df, columns)
-    return df
-def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
-    return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
-def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
-    always_here_cols = [
-        AutoEvalColumn.model_type_symbol.name,
-        AutoEvalColumn.model.name,
-    ]
-    # We use COLS to maintain sorting
-    filtered_df = df[
-        always_here_cols + [c for c in COLS if c in df.columns and c in columns]
-    ]
-    return filtered_df
-def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
-    # final_df = []
-    # if query != "":
-    #     queries = [q.strip() for q in query.split(";")]
-    #     for _q in queries:
-    #         _q = _q.strip()
-    #         if _q != "":
-    #             temp_filtered_df = search_table(filtered_df, _q)
-    #             if len(temp_filtered_df) > 0:
-    #                 final_df.append(temp_filtered_df)
-    #     if len(final_df) > 0:
-    #         filtered_df = pd.concat(final_df)
-    #         filtered_df = filtered_df.drop_duplicates(
-    #             subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
-    #         )
-    return filtered_df
-def filter_models(
-    df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
-) -> pd.DataFrame:
-    # Show all models
-    return df
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
-            with gr.Row():
-                with gr.Column():
-                    # with gr.Row():
-                    #     search_bar = gr.Textbox(
-                    #         placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                    #         show_label=False,
-                    #         elem_id="search-bar",
-                    #     )
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if not c.hidden and not c.never_hidden
-                            ],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-                    # with gr.Row():
-                    #     deleted_models_visibility = gr.Checkbox(
-                    #         value=False, label="Show gated/private/deleted models", interactive=True
-                    #     )
-                # with gr.Column(min_width=320):
-                #     #with gr.Box(elem_id="box-filter"):
-                #     filter_columns_type = gr.CheckboxGroup(
-                #         label="Model types",
-                #         choices=[t.to_str() for t in ModelType],
-                #         value=[t.to_str() for t in ModelType],
-                #         interactive=True,
-                #         elem_id="filter-columns-type",
-                #     )
-                #     filter_columns_precision = gr.CheckboxGroup(
-                #         label="Precision",
-                #         choices=[i.value.name for i in Precision],
-                #         value=[i.value.name for i in Precision],
-                #         interactive=True,
-                #         elem_id="filter-columns-precision",
-                #     )
-                #     filter_columns_size = gr.CheckboxGroup(
-                #         label="Model sizes (in billions of parameters)",
-                #         choices=list(NUMERIC_INTERVALS.keys()),
-                #         value=list(NUMERIC_INTERVALS.keys()),
-                #         interactive=True,
-                #         elem_id="filter-columns-size",
-                #     )
-            leaderboard_table = gr.components.Dataframe(
-                value=leaderboard_df[
-                    [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
-                    + shown_columns.value
-                ],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=original_df[COLS],
-                headers=COLS,
-                datatype=TYPES,
-                visible=False,
-            )
-            # search_bar.submit(
-            #     update_table,
-            #     [
-            #         hidden_leaderboard_table_for_search,
-            #         shown_columns,
-            #         filter_columns_type,
-            #         filter_columns_precision,
-            #         filter_columns_size,
-            #         deleted_models_visibility,
-            #         search_bar,
-            #     ],
-            #     leaderboard_table,
-            # )
-            for selector in [shown_columns]: # removed: filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        # filter_columns_type,
-                        # filter_columns_precision,
-                        # filter_columns_size,
-                        # deleted_models_visibility,
-                        # search_bar,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
-        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-        with gr.TabItem("Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# Submit your model here!", elem_classes="markdown-text")
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-    with gr.Row():
-        with gr.Accordion("Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
-scheduler = BackgroundScheduler()
-scheduler.add_job(restart_space, "interval", seconds=1800)
-scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()

src/legacy/submit.py DELETED Viewed

@@ -1,119 +0,0 @@
-import json
-import os
-from datetime import datetime, timezone
-from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
-REQUESTED_MODELS = None
-USERS_TO_SUBMISSION_DATES = None
-def add_new_eval(
-    model: str,
-    base_model: str,
-    revision: str,
-    precision: str,
-    weight_type: str,
-    model_type: str,
-):
-    global REQUESTED_MODELS
-    global USERS_TO_SUBMISSION_DATES
-    if not REQUESTED_MODELS:
-        REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-    precision = precision.split(" ")[0]
-    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-    # Does the model actually exist?
-    if revision == "":
-        revision = "main"
-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
-    # Is the model info correctly filled?
-    try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-    model_size = get_model_size(model_info=model_info, precision=precision)
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model")
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
-    # Seems good, creating the eval
-    print("Adding new eval")
-    eval_entry = {
-        "model": model,
-        "base_model": base_model,
-        "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
-        "status": "PENDING",
-        "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
-        "private": False,
-    }
-    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
-        return styled_warning("This model has been already submitted.")
-    print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-    with open(out_path, "w") as f:
-        f.write(json.dumps(eval_entry))
-    print("Uploading eval file")
-    API.upload_file(
-        path_or_fileobj=out_path,
-        path_in_repo=out_path.split("eval-queue/")[1],
-        repo_id=QUEUE_REPO,
-        repo_type="dataset",
-        commit_message=f"Add {model} to eval queue",
-    )
-    # Remove the local file
-    os.remove(out_path)
-    return styled_message(
-        "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
-    )

src/populate.py CHANGED Viewed

@@ -11,9 +11,11 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
     all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
@@ -40,6 +42,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
         elif ".md" not in entry:
             # this is a folder
             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
             for sub_entry in sub_entries:
                 file_path = os.path.join(save_path, entry, sub_entry)
                 with open(file_path) as fp:

 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
+    print(raw_data)
     all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(all_data_json)
+    print(df, AutoEvalColumn.average.name)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[cols].round(decimals=2)
         elif ".md" not in entry:
             # this is a folder
             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+            print(sub_entries)
             for sub_entry in sub_entries:
                 file_path = os.path.join(save_path, entry, sub_entry)
                 with open(file_path) as fp:

src/submission/submit.py CHANGED Viewed

@@ -51,7 +51,8 @@ def add_new_eval(
     # parse the submission file
     try:
-        submission_data = json.loads(file_path)
     except JSONDecodeError:
         return styled_error("Invalid submission file: JSON parsing failed.")
@@ -90,7 +91,7 @@ def add_new_eval(
         "update_timestamp": timestamp_filename,
     }
-    for challenge, result in results_per_challenge:
         try:
             parsed_result: float = parse_challenge_result_dict(challenge, result)
             assert isinstance(parsed_result, float)

     # parse the submission file
     try:
+        with open(file_path, "r") as f:
+            submission_data = json.load(f)
     except JSONDecodeError:
         return styled_error("Invalid submission file: JSON parsing failed.")
         "update_timestamp": timestamp_filename,
     }
+    for challenge, result in results_per_challenge.items():
         try:
             parsed_result: float = parse_challenge_result_dict(challenge, result)
             assert isinstance(parsed_result, float)