Tianyi (Alex) Qiu committed

Commit 139f14b · 1 Parent(s): 24a3e20

finish framework (esp. submit challenge & encrypt)

.gitignore CHANGED
@@ -15,4 +15,7 @@ logs/
  demo-leaderboard/
  results/
  upload_history/
- master_table.json
+ master_table.json
+
+ priv*
+ tmp*
app.py CHANGED
@@ -11,24 +11,27 @@ from src.about import (
      INTRODUCTION_TEXT,
      LLM_BENCHMARKS_TEXT,
      TITLE,
+     ABOUT_TEXT,
+     SUBMIT_CHALLENGE_TEXT,
  )
  from src.display.css_html_js import custom_css
  from src.display.utils import (
      BENCHMARK_COLS,
      COLS,
+     COLS_PAIRED,
      EVAL_COLS,
      EVAL_TYPES,
      NUMERIC_INTERVALS,
      TYPES,
      AutoEvalColumn,
-     ModelType,
+     AlgoType,
      fields,
      WeightType,
      Precision
  )
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, DATA_REPO, REPO_ID, TOKEN, REQUESTS_REPO_PATH, RESULTS_REPO_PATH, CACHE_PATH
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df, calc_average
+ from src.submission.submit import add_new_eval, add_new_challenge


  def restart_space():
@@ -43,7 +46,7 @@ except Exception:
      print("Could not download the dataset. Please check your token and network connection.")
      restart_space()

- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+ original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, COLS_PAIRED)
  leaderboard_df = original_df.copy()

  # Searching and filtering
@@ -52,6 +55,12 @@ def update_table(
      columns: list,
  ):
      df = select_columns(hidden_df, columns)
+     if AutoEvalColumn.average.name in df.columns:
+         df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+         df[[AutoEvalColumn.average.name]] = df[[AutoEvalColumn.average.name]].round(decimals=4)
+     elif AutoEvalColumn.model.name in df.columns:
+         df = df.sort_values(by=[AutoEvalColumn.model.name], ascending=True)
+
      return df


@@ -68,6 +77,9 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
      filtered_df = df[
          always_here_cols + [c for c in COLS if c in df.columns and c in columns]
      ]
+     if AutoEvalColumn.average.name in filtered_df.columns:
+         filtered_df[AutoEvalColumn.average.name] = filtered_df.apply(lambda row: calc_average(row, [col[0] for col in BENCHMARK_COLS]), axis=1)
+
      return filtered_df

  demo = gr.Blocks(css=custom_css)
@@ -124,23 +136,22 @@ with demo:
                  queue=True,
              )

-         with gr.TabItem("Submit Algorithm", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+         with gr.TabItem("Submit Algorithm", elem_id="llm-benchmark-tab-table", id=1):
+             with gr.Row():
+                 gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

              with gr.Row():
-                 gr.Markdown("# Submission Form\nSubmitted files will be stored and made public.", elem_classes="markdown-text")
+                 gr.Markdown("# Submission Form\nSubmitted files will be stored and made public. If you have any questions, please [contact](mailto:qiutianyi.qty@gmail.com) the ProgressGym team.", elem_classes="markdown-text")

              with gr.Row():
                  with gr.Column():
-                     submission_file = gr.File(label="Evaluation result (JSON file generated by `run_benchmark.py`, one algorithm on all challenges)")
+                     submission_file = gr.File(label="Evaluation result (JSON file generated by run_benchmark.py, one algorithm on all challenges)", file_types=['.json'])

                  with gr.Column():
                      algo_name = gr.Textbox(label="Algorithm display name")
-                     algo_info = gr.Textbox(label="Comments & extra information")
-                     algo_link = gr.Textbox(label="One external link (e.g. GitHub repo, paper, project page)")
-                     submitter_email = gr.Textbox(label="Email address for contact (will be kept confidential)")
+                     algo_info = gr.Textbox(label="Optional: Comments & extra information")
+                     algo_link = gr.Textbox(label="Optional: One external link (e.g. GitHub repo, paper, project page)")
+                     submitter_email = gr.Textbox(label="Optional: Email address for contact (will be encrypted with RSA-2048 for privacy before storage and public archiving)")

              submit_button = gr.Button("Submit Algorithm")
              submission_result = gr.Markdown()
@@ -155,9 +166,41 @@ with demo:
                  ],
                  submission_result,
              )
+
+         with gr.TabItem("Submit Challenge", elem_id="llm-benchmark-tab-table", id=2):
+             with gr.Row():
+                 gr.Markdown(SUBMIT_CHALLENGE_TEXT, elem_classes="markdown-text")
+
+             with gr.Row():
+                 gr.Markdown("# Submission Form\nSubmitted files will be stored and made public. If you have any questions, please [contact](mailto:qiutianyi.qty@gmail.com) the ProgressGym team.", elem_classes="markdown-text")
+
+             with gr.Row():
+                 with gr.Column():
+                     challenge_submission_file = gr.File(label="Optional: Evaluation results (JSON file(s) generated by run_benchmark.py, testing all algorithms on your challenge)", file_count='multiple', file_types=['.json'])
+
+                 with gr.Column():
+                     challenge_name = gr.Textbox(label="Challenge display name")
+                     challenge_info = gr.Textbox(label="Comments & extra information", lines=3)
+                     challenge_link = gr.Textbox(label="One external link (e.g. GitHub repo, paper, project page)")
+                     challenge_submitter_email = gr.Textbox(label="Email address for contact (will be encrypted with RSA-2048 for privacy before storage and public archiving)")
+
+             challenge_submit_button = gr.Button("Submit Challenge")
+             challenge_submission_result = gr.Markdown()
+             challenge_submit_button.click(
+                 add_new_challenge,
+                 [
+                     challenge_submission_file,
+                     challenge_name,
+                     challenge_info,
+                     challenge_link,
+                     challenge_submitter_email,
+                 ],
+                 challenge_submission_result,
+             )

      with gr.Row():
-         with gr.Accordion("About & Citation", open=False):
+         with gr.Accordion("About & Citation 📖", open=False):
+             about_text = gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
              citation_button = gr.Textbox(
                  value=CITATION_BUTTON_TEXT,
                  label=CITATION_BUTTON_LABEL,
pubkey.pem ADDED
@@ -0,0 +1,13 @@
+ -----BEGIN RSA PUBLIC KEY-----
+ MIICCgKCAgEAgEI1f9767WuoW0spjJYIXDdOrTrByOPc0cx/C5jeTKMQMxy11RrT
+ DDnZQl226AV68t6Wf86Z52XEFrkgUXRXlCfXCCM+kRxNkxQwA0faxlR2NI1576tu
+ OQBp6LL+Vmr2g18JE4kHo9T7TIkOmAiy21Vo/YILJgQ9cimgDu2KN3F9hqBhdNaO
+ 4n6L2g2TcActNt/ECtwVahtIrqZd8MZV0lYe1ieHR/d4KiPsnZ6FULpOG+ynqX4k
+ SzcD3kgiWb9QYF0GHV/pUkpUVmAkjyA0BGe360/au+NJ6oxHrLVA5ephvwBt9st6
+ P3xWvucuP30/YcQECGOz48DnEDKfXZ+4mYfMS9mUEEcP7qjoTN+wOGCRV8Z48U4X
+ KIP3r5NuT3+qY1uSO14fv9Uu9VvJgaySBBrAEk7n0Wf4ywL/5nt8C9/bivZIiDT+
+ 88cSKUy6mnHutB23pwy7UK/7jR+NH6h90zdJDHBnnQp8UEct1bsJxqUOTw3uTidc
+ Q3w4iRwXC3/A1Y7LD31qZ2AK7AkibTOJX8lXih5fbk/21PUEVqj3XBsOVuW3E1KO
+ iXCQ96SS+gk927gg/F79PKokXPup8+0t0xP8la1fZA0DENyyhflS3T/VDPYZ2rBd
+ VFD6Xf5UPC/NaAdONaLemEBYYMirXAmIs1l14SIdrtQPI72hdlxVi5UCAwEAAQ==
+ -----END RSA PUBLIC KEY-----
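The file above is a PKCS#1 RSA public key; src/envs.py (below) loads it with rsa.PublicKey.load_pkcs1, while the matching private key stays out of the repository thanks to the new `priv*` rule in .gitignore. As a minimal sketch of how such a key pair could be produced with the same `rsa` library (the key size and file names here are assumptions, not taken from this commit):

    # Sketch only: generate a PKCS#1 key pair compatible with rsa.PublicKey.load_pkcs1()
    # as used in src/envs.py. Key size and file names are assumptions.
    import rsa

    pubkey, privkey = rsa.newkeys(2048)  # the real key may use a different size

    with open("pubkey.pem", "wb") as f:
        f.write(pubkey.save_pkcs1())     # committed with the Space

    with open("privkey.pem", "wb") as f:
        f.write(privkey.save_pkcs1())    # matched by the new priv* ignore rule; never committed

Only the holder of the private key can recover the submitter emails that src/submission/submit.py stores as hex ciphertext.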
src/about.py CHANGED
@@ -14,9 +14,9 @@ class Tasks(Enum):
      # task_key in the json file, metric_key in the json file, name to display in the leaderboard
      # task0 = Task("anli_r1", "acc", "ANLI")
      # task1 = Task("logiqa", "acc_norm", "LogiQA")
-     task0 = Task("Follow", "accuracy", "Follow")
-     task1 = Task("Predict", "accuracy", "Predict")
-     task2 = Task("Coevolve", "accuracy", "Coevolve")
+     task0 = Task("Follow", "accuracy", "Follow")
+     task1 = Task("Predict", "accuracy", "Predict")
+     task2 = Task("Coevolve", "accuracy", "Coevolve")

  NUM_FEWSHOT = 0 # Change with your few shot
  # ---------------------------------------------------
@@ -24,52 +24,71 @@ NUM_FEWSHOT = 0 # Change with your few shot


  # Your leaderboard name
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+ TITLE = """<h1 align="center" id="space-title">ProgressGym Leaderboard</h1>"""

  # What does your leaderboard evaluate?
  INTRODUCTION_TEXT = """
- Intro text
+ Human values are evolving and have undergone huge, continual progress over the past millennium. Values embedded into the LLMs need to undergo the same process, or else we risk *locking-in* current human values by putting humans into an echo chamber of like-minded LLMs. This concern is especially salient when LLMs have become personal assistants, romantic partners, K-12 educators, etc., and [psychological studies](https://arxiv.org/abs/2302.00560) have demonstrated very significant impact of LLMs on human views.
+
+ ProgressGym-LeaderBoard is an open leaderboard for *progress alignment* algorithms - algorithms which learn and emulate the mechanics of moral progress, in order to facilitate continual improvements in real-world value decisions. Refer to the [ProgressGym paper](https://arxiv.org/abs/2406.20087) for more details.
  """

  # Which evaluations are you running? how can people reproduce what you have?
- LLM_BENCHMARKS_TEXT = f"""
- ## How it works
+ LLM_BENCHMARKS_TEXT = f""""""

- ## Reproducibility
- To reproduce our results, here is the commands you can run:
+ EVALUATION_QUEUE_TEXT = """
+ ## Steps to submit your progress alignment algorithm

- """
+ To submit your progress alignment algorithm, please follow the following steps.

- EVALUATION_QUEUE_TEXT = """
- ## Some good practices before submitting a model
+ #### Step 1: Clone the ProgressGym codebase

- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
- ```python
- from transformers import AutoConfig, AutoModel, AutoTokenizer
- config = AutoConfig.from_pretrained("your model name", revision=revision)
- model = AutoModel.from_pretrained("your model name", revision=revision)
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
- ```
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+ [More explanation coming]

- Note: make sure your model is public!
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+ #### Step 2: Implement your progress alignment algorithm as an `Examinee` class

- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+ [More explanation coming]

- ### 3) Make sure your model has an open license!
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+ #### Step 3: Run the benchmark script on all challenges

- ### 4) Fill up your model card
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+ [More explanation coming]

- ## In case of model failure
- If your model is displayed in the `FAILED` category, its execution stopped.
- Make sure you have followed the above steps first.
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+ #### Step 4: Submit the generated results as a JSON file
+
+ [More explanation coming]
  """

- CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
- CITATION_BUTTON_TEXT = r"""
+ ABOUT_TEXT = """ProgressGym-LeaderBoard is tightly coupled with the ProgressGym experimental framework for progress alignment research, which provides historical datasets, historical LLMs, simulation environments, algorithm implementations, and benchmarks on progress alignment challenges. Please refer to the [ProgressGym paper](https://arxiv.org/abs/2406.20087) for more details."""
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite the project"
+ CITATION_BUTTON_TEXT = r"""@article{progressgym,
+ title={ProgressGym: Alignment with a Millennium of Moral Progress},
+ author={Tianyi Qiu and Yang Zhang and Xuchuan Huang and Jasmine Xinze Li and Jiaming Ji and Yaodong Yang},
+ journal={arXiv preprint arXiv:2406.20087},
+ eprint={2406.20087},
+ eprinttype = {arXiv},
+ year={2024}
+ }
  """
+
+
+ SUBMIT_CHALLENGE_TEXT = """
+ ## Steps to submit your progress alignment challenge
+
+ To submit your progress alignment challenge, please follow the following steps.
+
+ #### Step 1: Clone the ProgressGym codebase
+
+ [More explanation coming]
+
+ #### Step 2: Implement your progress alignment challenge as a `Judge` class
+
+ [More explanation coming]
+
+ #### Step 3 (optional but recommended): Run the benchmark script on all challenges
+
+ [More explanation coming]
+
+ #### Step 4: Submit the link to your codebase and (optional but recommended) the generated results as a JSON file
+
+ [More explanation coming]
+ """
src/display/utils.py CHANGED
@@ -8,6 +8,8 @@ from src.about import Tasks
  def fields(raw_class):
      return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

+ def fields_paired(raw_class):
+     return [(k,v) for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

  # These classes are for user facing column names,
  # to avoid having to change them all around the code
@@ -25,23 +27,25 @@ auto_eval_column_dict = []

  # Init
  # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Algorithm", "markdown", True, never_hidden=True)])

  #Scores
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ", "number", True)])
  for task in Tasks:
      auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])

- # Model information
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+ # Algorithm information
+ auto_eval_column_dict.append(["info", ColumnContent, ColumnContent("Info", "str", True)])
+ auto_eval_column_dict.append(["update_timestamp", ColumnContent, ColumnContent("Update timestamp", "str", False)])
+ # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+ # auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+ # auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+ # auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+ # auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+ # auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+ # auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+ # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+ # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Algorithm sha", "str", False, False)])

  # We use make dataclass to dynamically fill the scores from Tasks
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -58,18 +62,18 @@ class EvalQueueColumn: # Queue column

  ## All the model information that we might need
  @dataclass
- class ModelDetails:
+ class AlgoDetails:
      name: str
      display_name: str = ""
      symbol: str = "" # emoji


- class ModelType(Enum):
-     PT = ModelDetails(name="pretrained", symbol="🟢")
-     FT = ModelDetails(name="fine-tuned", symbol="🔶")
-     IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-     RL = ModelDetails(name="RL-tuned", symbol="🟦")
-     Unknown = ModelDetails(name="", symbol="?")
+ class AlgoType(Enum):
+     PT = AlgoDetails(name="pretrained", symbol="🟢")
+     FT = AlgoDetails(name="fine-tuned", symbol="🔶")
+     IFT = AlgoDetails(name="instruction-tuned", symbol="⭕")
+     RL = AlgoDetails(name="RL-tuned", symbol="🟦")
+     Unknown = AlgoDetails(name="", symbol="?")

      def to_str(self, separator=" "):
          return f"{self.value.symbol}{separator}{self.value.name}"
@@ -77,28 +81,28 @@ class ModelType(Enum):
      @staticmethod
      def from_str(type):
          if "fine-tuned" in type or "🔶" in type:
-             return ModelType.FT
+             return AlgoType.FT
          if "pretrained" in type or "🟢" in type:
-             return ModelType.PT
+             return AlgoType.PT
          if "RL-tuned" in type or "🟦" in type:
-             return ModelType.RL
+             return AlgoType.RL
          if "instruction-tuned" in type or "⭕" in type:
-             return ModelType.IFT
-         return ModelType.Unknown
+             return AlgoType.IFT
+         return AlgoType.Unknown

  class WeightType(Enum):
-     Adapter = ModelDetails("Adapter")
-     Original = ModelDetails("Original")
-     Delta = ModelDetails("Delta")
+     Adapter = AlgoDetails("Adapter")
+     Original = AlgoDetails("Original")
+     Delta = AlgoDetails("Delta")

  class Precision(Enum):
-     float16 = ModelDetails("float16")
-     bfloat16 = ModelDetails("bfloat16")
-     float32 = ModelDetails("float32")
-     #qt_8bit = ModelDetails("8bit")
-     #qt_4bit = ModelDetails("4bit")
-     #qt_GPTQ = ModelDetails("GPTQ")
-     Unknown = ModelDetails("?")
+     float16 = AlgoDetails("float16")
+     bfloat16 = AlgoDetails("bfloat16")
+     float32 = AlgoDetails("float32")
+     #qt_8bit = AlgoDetails("8bit")
+     #qt_4bit = AlgoDetails("4bit")
+     #qt_GPTQ = AlgoDetails("GPTQ")
+     Unknown = AlgoDetails("?")

      def from_str(precision):
          if precision in ["torch.float16", "float16"]:
@@ -124,7 +128,8 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

- BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+ BENCHMARK_COLS = [(t.value.col_name, t.value.benchmark) for t in Tasks]
+ COLS_PAIRED = [(c.name, name) for name, c in fields_paired(AutoEvalColumn) if not c.hidden]

  NUMERIC_INTERVALS = {
      "?": pd.Interval(-1, 0, closed="right"),
@@ -135,4 +140,4 @@ NUMERIC_INTERVALS = {
      "~35": pd.Interval(20, 45, closed="right"),
      "~60": pd.Interval(45, 70, closed="right"),
      "70+": pd.Interval(70, 10000, closed="right"),
- }
+ }
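For reference, with the three Tasks defined in src/about.py (Follow, Predict, Coevolve), the new paired constants should work out to the pairs sketched below. This is hand-derived from the code above, not captured from the running Space.

    # Hand-derived sketch of the expected values of the new paired constants.
    expected_benchmark_cols = [            # (display name, benchmark key)
        ("Follow", "Follow"),
        ("Predict", "Predict"),
        ("Coevolve", "Coevolve"),
    ]
    expected_cols_paired = [               # (display name, AutoEvalColumn attribute name)
        ("Algorithm", "model"),
        ("Average ", "average"),
        ("Follow", "task0"),
        ("Predict", "task1"),
        ("Coevolve", "task2"),
        ("Info", "info"),
        ("Update timestamp", "update_timestamp"),
    ]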
src/envs.py CHANGED
@@ -2,6 +2,8 @@ import os

  from huggingface_hub import HfApi

+ import rsa
+
  # Info to change for your repository
  # ----------------------------------
  TOKEN = os.environ.get("TOKEN") # A read/write token for your org
@@ -24,3 +26,7 @@ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
  EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

  API = HfApi(token=TOKEN)
+
+ with open('pubkey.pem', 'rb') as f:
+     pub = f.read()
+     RSA_PUBKEY = rsa.PublicKey.load_pkcs1(pub)
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
  import numpy as np

  from src.display.formatting import make_clickable_model
- from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+ from src.display.utils import AutoEvalColumn, AlgoType, Tasks, Precision, WeightType
  from src.submission.check_validity import is_model_on_hub


@@ -23,7 +23,7 @@ class EvalResult:
      revision: str # commit hash, "" if main
      results: dict
      precision: Precision = Precision.Unknown
-     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+     model_type: AlgoType = AlgoType.Unknown # Pretrained, fine tuned, ...
      weight_type: WeightType = WeightType.Original # Original or Adapter
      architecture: str = "Unknown"
      license: str = "?"
@@ -91,39 +91,39 @@ class EvalResult:
              architecture=architecture
          )

-     def update_with_request_file(self, requests_path):
-         """Finds the relevant request file for the current model and updates info with it"""
-         request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
-
-         try:
-             with open(request_file, "r") as f:
-                 request = json.load(f)
-             self.model_type = ModelType.from_str(request.get("model_type", ""))
-             self.weight_type = WeightType[request.get("weight_type", "Original")]
-             self.license = request.get("license", "?")
-             self.likes = request.get("likes", 0)
-             self.num_params = request.get("params", 0)
-             self.date = request.get("submitted_time", "")
-         except Exception:
-             print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+     # def update_with_request_file(self, requests_path):
+     #     """Finds the relevant request file for the current model and updates info with it"""
+     #     request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+
+     #     try:
+     #         with open(request_file, "r") as f:
+     #             request = json.load(f)
+     #         self.model_type = AlgoType.from_str(request.get("model_type", ""))
+     #         self.weight_type = WeightType[request.get("weight_type", "Original")]
+     #         self.license = request.get("license", "?")
+     #         self.likes = request.get("likes", 0)
+     #         self.num_params = request.get("params", 0)
+     #         self.date = request.get("submitted_time", "")
+     #     except Exception:
+     #         print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")

      def to_dict(self):
          """Converts the Eval Result to a dict compatible with our dataframe display"""
          average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
          data_dict = {
              "eval_name": self.eval_name, # not a column, just a save name,
-             AutoEvalColumn.precision.name: self.precision.value.name,
-             AutoEvalColumn.model_type.name: self.model_type.value.name,
+             # AutoEvalColumn.precision.name: self.precision.value.name,
+             # AutoEvalColumn.model_type.name: self.model_type.value.name,
              # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-             AutoEvalColumn.architecture.name: self.architecture,
+             # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+             # AutoEvalColumn.architecture.name: self.architecture,
              AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-             AutoEvalColumn.revision.name: self.revision,
+             # AutoEvalColumn.revision.name: self.revision,
              AutoEvalColumn.average.name: average,
-             AutoEvalColumn.license.name: self.license,
-             AutoEvalColumn.likes.name: self.likes,
-             AutoEvalColumn.params.name: self.num_params,
-             AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+             # AutoEvalColumn.license.name: self.license,
+             # AutoEvalColumn.likes.name: self.likes,
+             # AutoEvalColumn.params.name: self.num_params,
+             # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
          }

          for task in Tasks:
@@ -181,7 +181,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
      for model_result_filepath in model_result_filepaths:
          # Creation of result
          eval_result = EvalResult.init_from_json_file(model_result_filepath)
-         eval_result.update_with_request_file(requests_path)
+         # eval_result.update_with_request_file(requests_path)
          print(f"Found result for {eval_result.full_model} with precision {eval_result.precision.value.name}")

          # Store results of same eval together
src/populate.py CHANGED
@@ -3,25 +3,64 @@ import os

  import pandas as pd

- from src.display.formatting import has_no_nan_values, make_clickable_model
+ from src.display.formatting import has_no_nan_values, make_clickable_model, model_hyperlink
  from src.display.utils import AutoEvalColumn, EvalQueueColumn
  from src.leaderboard.read_evals import get_raw_eval_results

+ def calc_average(row: pd.Series, benchmark_cols: list) -> float:
+     """Calculates the average of the benchmark columns that exist in the row"""
+     return row[[col for col in benchmark_cols if col in row]].mean()

- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols_paired: list, cols_paired: list) -> pd.DataFrame:
      """Creates a dataframe from all the individual experiment results"""
-     raw_data = get_raw_eval_results(results_path, requests_path)
-     print(raw_data)
-     all_data_json = [v.to_dict() for v in raw_data]
+     # raw_data = get_raw_eval_results(results_path, requests_path)
+     # all_data_json = [v.to_dict() for v in raw_data]
+     all_data_json = []
+     benchmark_cols = [col[0] for col in benchmark_cols_paired]
+     with open('./master_table.json') as f:
+         content = json.load(f)
+         for key, val in content.items():
+             val['eval_name'] = val['id']
+             del val['id']
+
+             if 'link' in val and val['link'].strip():
+                 val['Algorithm'] = model_hyperlink(val['link'], val['name'])
+             else:
+                 val['Algorithm'] = val['name']
+
+             del val['name']
+
+             # fill in the missing benchmark columns as 0
+             for display_name, benchmark in benchmark_cols_paired:
+                 if benchmark not in val:
+                     val[display_name] = 0
+                 else:
+                     val[display_name] = val[benchmark]
+                     del val[benchmark]
+
+             # change all the keys to the display names
+             for display_name, col in cols_paired:
+                 if display_name in val:
+                     pass
+                 elif col in val:
+                     val[display_name] = val[col]
+                     del val[col]
+                 else:
+                     val[display_name] = None
+
+             all_data_json.append(val)
+
+         print(f'All data json: {all_data_json}')

      df = pd.DataFrame.from_records(all_data_json)
+     df[AutoEvalColumn.average.name] = df.apply(lambda row: calc_average(row, benchmark_cols), axis=1)
      print(df, AutoEvalColumn.average.name)
      df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-     df = df[cols].round(decimals=2)
+     df = df[cols].round(decimals=4)

      # filter out if any of the benchmarks have not been produced
      df = df[has_no_nan_values(df, benchmark_cols)]
-     return raw_data, df
+     return df


  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
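The rewritten get_leaderboard_df reads master_table.json keyed by submission, expecting an id, a name, an optional link, per-challenge scores keyed by the Task benchmark names, and any extra fields named in cols_paired (such as info and update_timestamp). A hand-written illustration of an entry this code would accept; every value (and the key format) is invented here, since the real table is produced by add_new_eval:

    # Illustrative only -- not real leaderboard data. Field names follow what
    # get_leaderboard_df reads; the values and the outer key are invented.
    example_master_table = {
        "some-submission-id": {
            "id": "some-submission-id",             # copied into eval_name
            "name": "My Algorithm",                 # shown in the Algorithm column (hyperlinked if link is set)
            "link": "https://example.com/my-algo",  # optional; an empty string disables the hyperlink
            "info": "free-form comments",           # surfaced via cols_paired as the Info column
            "update_timestamp": "2024-07-01T00-00-00",
            "Follow": 0.5,                          # per-challenge scores, keyed by Task benchmark names
            "Predict": 0.5,
            "Coevolve": 0.5,
        }
    }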
src/submission/submit.py CHANGED
@@ -2,6 +2,9 @@ import json
  import os
  import re
  from datetime import datetime, timezone
+ import rsa
+
+ from src.envs import RSA_PUBKEY

  from src.challenges.result_parsers import parse_challenge_result_dict

@@ -30,14 +33,16 @@ def add_new_eval(
      algo_link: str,
      submitter_email: str,
  ):
-     return_str = 'Success! Your submission will soon be added to the leaderboard.'
+     return_str = 'Success! Your submission will be added to the leaderboard within 24 hours.'

      # validate email and url
      if not parseaddr(submitter_email):
          return styled_error("Please enter a valid email address.")

+     submitter_email = rsa.encrypt(submitter_email.encode(), RSA_PUBKEY).hex()
+
      if algo_link.strip() and not urlparse(algo_link).scheme:
-         return styled_error("Please enter a valid URL.")
+         return styled_error("Please enter a valid URL (including the http/https protocol).")

      # get file path
      try:
@@ -91,6 +96,20 @@
          "update_timestamp": timestamp_filename,
      }

+     # Upload the metadata file
+     print("Uploading metadata file")
+     metadata_filename = f'./tmp_metadata_{algo_name_filename}_{timestamp_filename}.json'
+     with open(metadata_filename, 'w') as f:
+         f.write(json.dumps(eval_entry))
+
+     API.upload_file(
+         path_or_fileobj=metadata_filename,
+         path_in_repo=f'upload_history/{algo_name_filename}/{timestamp_filename}_metadata.json',
+         repo_id=DATA_REPO,
+         repo_type="dataset",
+         commit_message=f"Add metadata {algo_name} by {submitter_email} at {timestamp_filename}",
+     )
+
      for challenge, result in results_per_challenge.items():
          try:
              parsed_result: float = parse_challenge_result_dict(challenge, result)
@@ -103,8 +122,8 @@
      # Get content of the master table from DATA_REPO
      try:
          master_table = {}
-         if API.file_exists(DATA_REPO, "master_table.json"):
-             API.hf_hub_download(DATA_REPO, "master_table.json", EVAL_REQUESTS_PATH, force_download=True)
+         if API.file_exists(DATA_REPO, "master_table.json", repo_type='dataset'):
+             API.hf_hub_download(DATA_REPO, "master_table.json", local_dir=EVAL_REQUESTS_PATH, repo_type='dataset', force_download=True)
              with open(f"{EVAL_REQUESTS_PATH}/master_table.json", "r") as f:
                  master_table = json.load(f)
          else:
@@ -134,3 +153,82 @@
      )

      return styled_message(return_str)
+
+
+
+ def add_new_challenge(
+     submission_files,
+     challenge_name: str,
+     challenge_info: str,
+     challenge_link: str,
+     submitter_email: str,
+ ):
+     return_str = 'Success! We are working to incorporate your submitted challenge into the leaderboard, and will get back to you when we encounter problems.'
+
+     # validate email and url
+     if not parseaddr(submitter_email):
+         return styled_error("Please enter a valid email address.")
+
+     submitter_email = rsa.encrypt(submitter_email.encode(), RSA_PUBKEY).hex()
+
+     if challenge_link.strip() and not urlparse(challenge_link).scheme:
+         return styled_error("Please enter a valid URL (including the http/https protocol).")
+
+     # get file path
+     if submission_files is None:
+         submission_files = []
+     else:
+         try:
+             assert isinstance(submission_files, list)
+             assert all(isinstance(file, str) for file in submission_files)
+         except:
+             return styled_error("Invalid submission file: File path not found.")
+
+     # format the challenge name
+     challenge_name = challenge_name.strip()
+     challenge_name_filename = re.sub(r"[^a-zA-Z0-9]+", "-", challenge_name).lower()
+     timestamp_filename = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")
+
+     for num, file_path in enumerate(submission_files):
+         # parse the submission file
+         try:
+             with open(file_path, "r") as f:
+                 submission_data = json.load(f)
+         except JSONDecodeError:
+             return styled_error(f"Invalid submission file {os.path.basename(file_path)}: JSON parsing failed.")
+
+         try:
+             assert isinstance(submission_data, dict)
+             assert all(isinstance(result, dict) for result in submission_data.values())
+         except (AssertionError, KeyError):
+             return styled_error(f"Invalid submission file {os.path.basename(file_path)}: Incorrect organization of the JSON file.")
+
+         print("Uploading submission file")
+         API.upload_file(
+             path_or_fileobj=file_path,
+             path_in_repo=f'upload_history/{challenge_name_filename}/{timestamp_filename}_file{num}_{os.path.basename(file_path)}.json',
+             repo_id=DATA_REPO,
+             repo_type="dataset",
+             commit_message=f"Add {challenge_name} to eval queue by {submitter_email} at {timestamp_filename}",
+         )
+
+     print("Uploading metadata file")
+     filename = f'./tmp_metadata_{challenge_name_filename}_{timestamp_filename}.json'
+     with open(filename, 'w') as f:
+         f.write(json.dumps({
+             "name": challenge_name,
+             "info": challenge_info,
+             "link": challenge_link,
+             "email": submitter_email,
+             "update_timestamp": timestamp_filename,
+         }))
+
+     API.upload_file(
+         path_or_fileobj=filename,
+         path_in_repo=f'upload_history/{challenge_name_filename}/{timestamp_filename}_metadata.json',
+         repo_id=DATA_REPO,
+         repo_type="dataset",
+         commit_message=f"Add metadata {challenge_name} by {submitter_email} at {timestamp_filename}",
+     )
+
+     return styled_message(return_str)
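add_new_challenge above only sanity-checks each uploaded JSON file: it must parse to a dict whose values are themselves dicts, mirroring the per-challenge result dicts that add_new_eval feeds to parse_challenge_result_dict. A hand-written file that would pass those checks; the names and inner fields are invented, and run_benchmark.py defines the real schema:

    # Illustrative only: the minimal structure accepted by add_new_challenge's checks.
    # Keys and inner fields are invented; run_benchmark.py defines the real output.
    import json

    example_results = {
        "algorithm-a": {"score": 0.5},
        "algorithm-b": {"score": 0.5},
    }

    with open("my_challenge_results.json", "w") as f:
        json.dump(example_results, f)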
tempCodeRunnerFile.python ADDED
@@ -0,0 +1,17 @@
+ import rsa
+
+ content = 'hello,world!@#$%^&*()_+'
+
+ with open('pubkey.pem', mode='rb') as f:
+     pubkey = rsa.PublicKey.load_pkcs1(f.read())
+
+ with open('privkey.pem', mode='rb') as f:
+     prikey = rsa.PrivateKey.load_pkcs1(f.read())
+
+ crypto = rsa.encrypt(content.encode(), pubkey)
+ print(crypto.hex())
+
+ crypto = bytes.fromhex('24cb974ad7d5673bfc9e7bca3ad66d2365f52ef5cb6cdbc72f1bb5877cf358fcffc3ed34682b205d8b4a8982e979e9ef7f6f2250220f271fcbb2f733ee68a511acb49edbd437fc2798b7eaf7d890c72a7c5ce98254940fa1f40c44deecacd57b97576570f30a3704ba31949cd37e6ed5e594827e524d5a03341293cf767e38af2d8dbe089d370f12454e3290c6071d861940611855dbcd62f7b5dda5af2afb91c8d7c8545242d5bb9b88f49feb969f6d18ad6eea9947e8ced1a0e0aae4b025eb0225753dca9e796652def27bff7d77b1a5c97cd95e9bf638d2e33cd21b0bb0e9d3e3b0fc490b181ed6933ea555792f86ac0ea438479f4d66f1404d93924eeaff05f2d533ba710e11d7018ecda1eb1490c60e23a92d855a0de2f1d811382bf6b49bafd69c507b84d2e1ab7816fcfdc7393d4207ebd7260ec9711e9a180b3263e16e590f7eb15b163f4ccc55147530aa8ba6c34272060d8befc19703c1d1e199cbca80fa47869d8f80a1053d227abd519f9ce17bb09d676fda6ca5c3e547231df9aafba2a8ced35ace7b79087129b373535c3057b6042484ace1d03d982605ea268d6c454f5096b54c528f8373fb4b90d2b7cdb658d7cbd31035efddbbc2e443d0a106d450392d503cb3f2082b4607d070aa090649bc443fe54e7c95ed6059b3aa1a42154169a09c3481ee863c41cf6a7ee2ce05670b5a5cce9259a41669b4e647')
+
+ content2 = rsa.decrypt(crypto, prikey).decode()
+ print(content2, len(content2), len(content2.encode()))