Commit 139f14b by Tianyi (Alex) Qiu
1 Parent(s): 24a3e20
finish framework (esp. submit challenge & encrypt)

Changed files:
- .gitignore +4 -1
- app.py +57 -14
- pubkey.pem +13 -0
- src/about.py +53 -34
- src/display/utils.py +41 -36
- src/envs.py +6 -0
- src/leaderboard/read_evals.py +27 -27
- src/populate.py +46 -7
- src/submission/submit.py +102 -4
- tempCodeRunnerFile.python +17 -0
.gitignore
CHANGED
@@ -15,4 +15,7 @@ logs/
 demo-leaderboard/
 results/
 upload_history/
-master_table.json
+master_table.json
+
+priv*
+tmp*
app.py
CHANGED
@@ -11,24 +11,27 @@ from src.about import (
     INTRODUCTION_TEXT,
     LLM_BENCHMARKS_TEXT,
     TITLE,
+    ABOUT_TEXT,
+    SUBMIT_CHALLENGE_TEXT,
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
     COLS,
+    COLS_PAIRED,
     EVAL_COLS,
     EVAL_TYPES,
     NUMERIC_INTERVALS,
     TYPES,
     AutoEvalColumn,
-
+    AlgoType,
     fields,
     WeightType,
     Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, DATA_REPO, REPO_ID, TOKEN, REQUESTS_REPO_PATH, RESULTS_REPO_PATH, CACHE_PATH
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+from src.populate import get_evaluation_queue_df, get_leaderboard_df, calc_average
+from src.submission.submit import add_new_eval, add_new_challenge
 
 
 def restart_space():
@@ -43,7 +46,7 @@ except Exception:
     print("Could not download the dataset. Please check your token and network connection.")
     restart_space()
 
-
+original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS, COLS_PAIRED)
 leaderboard_df = original_df.copy()
 
 # Searching and filtering
@@ -52,6 +55,12 @@ def update_table(
     columns: list,
 ):
     df = select_columns(hidden_df, columns)
+    if AutoEvalColumn.average.name in df.columns:
+        df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+        df[[AutoEvalColumn.average.name]] = df[[AutoEvalColumn.average.name]].round(decimals=4)
+    elif AutoEvalColumn.model.name in df.columns:
+        df = df.sort_values(by=[AutoEvalColumn.model.name], ascending=True)
+
     return df
 
 
@@ -68,6 +77,9 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     filtered_df = df[
         always_here_cols + [c for c in COLS if c in df.columns and c in columns]
     ]
+    if AutoEvalColumn.average.name in filtered_df.columns:
+        filtered_df[AutoEvalColumn.average.name] = filtered_df.apply(lambda row: calc_average(row, [col[0] for col in BENCHMARK_COLS]), axis=1)
+
     return filtered_df
 
 demo = gr.Blocks(css=custom_css)
@@ -124,23 +136,22 @@ with demo:
                 queue=True,
             )
 
-        with gr.TabItem("Submit Algorithm", elem_id="llm-benchmark-tab-table", id=
-            with gr.
-
-                gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+        with gr.TabItem("Submit Algorithm", elem_id="llm-benchmark-tab-table", id=1):
+            with gr.Row():
+                gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
             with gr.Row():
-                gr.Markdown("# Submission Form\nSubmitted files will be stored and made public.", elem_classes="markdown-text")
+                gr.Markdown("# Submission Form\nSubmitted files will be stored and made public. If you have any questions, please [contact](mailto:qiutianyi.qty@gmail.com) the ProgressGym team.", elem_classes="markdown-text")
 
             with gr.Row():
                 with gr.Column():
-                    submission_file = gr.File(label="Evaluation result (JSON file generated by
+                    submission_file = gr.File(label="Evaluation result (JSON file generated by run_benchmark.py, one algorithm on all challenges)", file_types=['.json'])
 
                 with gr.Column():
                     algo_name = gr.Textbox(label="Algorithm display name")
-                    algo_info = gr.Textbox(label="Comments & extra information")
-                    algo_link = gr.Textbox(label="One external link (e.g. GitHub repo, paper, project page)")
-                    submitter_email = gr.Textbox(label="Email address for contact (will be
+                    algo_info = gr.Textbox(label="Optional: Comments & extra information")
+                    algo_link = gr.Textbox(label="Optional: One external link (e.g. GitHub repo, paper, project page)")
+                    submitter_email = gr.Textbox(label="Optional: Email address for contact (will be encrypted with RSA-2048 for privacy before storage and public archiving)")
 
             submit_button = gr.Button("Submit Algorithm")
             submission_result = gr.Markdown()
@@ -155,9 +166,41 @@ with demo:
                 ],
                 submission_result,
             )
+
+        with gr.TabItem("Submit Challenge", elem_id="llm-benchmark-tab-table", id=2):
+            with gr.Row():
+                gr.Markdown(SUBMIT_CHALLENGE_TEXT, elem_classes="markdown-text")
+
+            with gr.Row():
+                gr.Markdown("# Submission Form\nSubmitted files will be stored and made public. If you have any questions, please [contact](mailto:qiutianyi.qty@gmail.com) the ProgressGym team.", elem_classes="markdown-text")
+
+            with gr.Row():
+                with gr.Column():
+                    challenge_submission_file = gr.File(label="Optional: Evaluation results (JSON file(s) generated by run_benchmark.py, testing all algorithms on your challenge)", file_count='multiple', file_types=['.json'])
+
+                with gr.Column():
+                    challenge_name = gr.Textbox(label="Challenge display name")
+                    challenge_info = gr.Textbox(label="Comments & extra information", lines=3)
+                    challenge_link = gr.Textbox(label="One external link (e.g. GitHub repo, paper, project page)")
+                    challenge_submitter_email = gr.Textbox(label="Email address for contact (will be encrypted with RSA-2048 for privacy before storage and public archiving)")
+
+            challenge_submit_button = gr.Button("Submit Challenge")
+            challenge_submission_result = gr.Markdown()
+            challenge_submit_button.click(
+                add_new_challenge,
+                [
+                    challenge_submission_file,
+                    challenge_name,
+                    challenge_info,
+                    challenge_link,
+                    challenge_submitter_email,
+                ],
+                challenge_submission_result,
+            )
 
     with gr.Row():
-        with gr.Accordion("About & Citation", open=False):
+        with gr.Accordion("About & Citation 📖", open=False):
+            about_text = gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
             citation_button = gr.Textbox(
                 value=CITATION_BUTTON_TEXT,
                 label=CITATION_BUTTON_LABEL,
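
The new update_table/select_columns path recomputes the "Average ↑" column from whichever benchmark columns are currently selected, then sorts by it. Below is a minimal standalone sketch of that behaviour; the algorithm names and scores are made up for illustration and are not the Space's real data.

# Minimal sketch of the recompute-then-sort behaviour added to app.py above.
# Column names and scores are illustrative only.
import pandas as pd

def calc_average(row: pd.Series, benchmark_cols: list) -> float:
    # Same idea as calc_average in src/populate.py: average only the benchmark columns present in the row.
    return row[[col for col in benchmark_cols if col in row]].mean()

df = pd.DataFrame({
    "Algorithm": ["algo-a", "algo-b"],
    "Follow ↑": [0.61, 0.47],
    "Predict ↑": [0.55, 0.72],
})
benchmark_cols = ["Follow ↑", "Predict ↑", "Coevolve ↑"]  # "Coevolve ↑" is deselected/missing here

df["Average ↑"] = df.apply(lambda row: calc_average(row, benchmark_cols), axis=1)
df = df.sort_values(by=["Average ↑"], ascending=False).round(decimals=4)
print(df)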
pubkey.pem
ADDED
@@ -0,0 +1,13 @@
+-----BEGIN RSA PUBLIC KEY-----
+MIICCgKCAgEAgEI1f9767WuoW0spjJYIXDdOrTrByOPc0cx/C5jeTKMQMxy11RrT
+DDnZQl226AV68t6Wf86Z52XEFrkgUXRXlCfXCCM+kRxNkxQwA0faxlR2NI1576tu
+OQBp6LL+Vmr2g18JE4kHo9T7TIkOmAiy21Vo/YILJgQ9cimgDu2KN3F9hqBhdNaO
+4n6L2g2TcActNt/ECtwVahtIrqZd8MZV0lYe1ieHR/d4KiPsnZ6FULpOG+ynqX4k
+SzcD3kgiWb9QYF0GHV/pUkpUVmAkjyA0BGe360/au+NJ6oxHrLVA5ephvwBt9st6
+P3xWvucuP30/YcQECGOz48DnEDKfXZ+4mYfMS9mUEEcP7qjoTN+wOGCRV8Z48U4X
+KIP3r5NuT3+qY1uSO14fv9Uu9VvJgaySBBrAEk7n0Wf4ywL/5nt8C9/bivZIiDT+
+88cSKUy6mnHutB23pwy7UK/7jR+NH6h90zdJDHBnnQp8UEct1bsJxqUOTw3uTidc
+Q3w4iRwXC3/A1Y7LD31qZ2AK7AkibTOJX8lXih5fbk/21PUEVqj3XBsOVuW3E1KO
+iXCQ96SS+gk927gg/F79PKokXPup8+0t0xP8la1fZA0DENyyhflS3T/VDPYZ2rBd
+VFD6Xf5UPC/NaAdONaLemEBYYMirXAmIs1l14SIdrtQPI72hdlxVi5UCAwEAAQ==
+-----END RSA PUBLIC KEY-----
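
pubkey.pem is a PKCS#1-format RSA public key, the same format that src/envs.py loads with the rsa package. A quick, illustrative sanity check (not part of the Space) that the committed key parses and reports its modulus size:

# Illustrative check only: load the committed public key with python-rsa
# and print the modulus size in bits.
import rsa

with open("pubkey.pem", "rb") as f:
    pubkey = rsa.PublicKey.load_pkcs1(f.read())

print(pubkey.n.bit_length())  # modulus size in bits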
src/about.py
CHANGED
@@ -14,9 +14,9 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     # task0 = Task("anli_r1", "acc", "ANLI")
     # task1 = Task("logiqa", "acc_norm", "LogiQA")
-    task0 = Task("Follow", "accuracy", "Follow")
-    task1 = Task("Predict", "accuracy", "Predict")
-    task2 = Task("Coevolve", "accuracy", "Coevolve")
+    task0 = Task("Follow", "accuracy", "Follow ↑")
+    task1 = Task("Predict", "accuracy", "Predict ↑")
+    task2 = Task("Coevolve", "accuracy", "Coevolve ↑")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -24,52 +24,71 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">ProgressGym Leaderboard</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-
+Human values are evolving and have undergone huge, continual progress over the past millennium. Values embedded into the LLMs need to undergo the same process, or else we risk *locking-in* current human values by putting humans into an echo chamber of like-minded LLMs. This concern is especially salient when LLMs have become personal assistants, romantic partners, K-12 educators, etc., and [psychological studies](https://arxiv.org/abs/2302.00560) have demonstrated very significant impact of LLMs on human views.
+
+ProgressGym-LeaderBoard is an open leaderboard for *progress alignment* algorithms - algorithms which learn and emulate the mechanics of moral progress, in order to facilitate continual improvements in real-world value decisions. Refer to the [ProgressGym paper](https://arxiv.org/abs/2406.20087) for more details.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-## How it works
-
-## Some good practices before submitting a model
-
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
+LLM_BENCHMARKS_TEXT = f""""""
+
+EVALUATION_QUEUE_TEXT = """
+## Steps to submit your progress alignment algorithm
+
+To submit your progress alignment algorithm, please follow the following steps.
+
+#### Step 1: Clone the ProgressGym codebase
+
+[More explanation coming]
+
+#### Step 2: Implement your progress alignment algorithm as an `Examinee` class
+
+[More explanation coming]
+
+#### Step 3: Run the benchmark script on all challenges
+
+[More explanation coming]
+
+#### Step 4: Submit the generated results as a JSON file
+
+[More explanation coming]
+"""
 
-
-
+ABOUT_TEXT = """ProgressGym-LeaderBoard is tightly coupled with the ProgressGym experimental framework for progress alignment research, which provides historical datasets, historical LLMs, simulation environments, algorithm implementations, and benchmarks on progress alignment challenges. Please refer to the [ProgressGym paper](https://arxiv.org/abs/2406.20087) for more details."""
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite the project"
+CITATION_BUTTON_TEXT = r"""@article{progressgym,
+    title={ProgressGym: Alignment with a Millennium of Moral Progress},
+    author={Tianyi Qiu and Yang Zhang and Xuchuan Huang and Jasmine Xinze Li and Jiaming Ji and Yaodong Yang},
+    journal={arXiv preprint arXiv:2406.20087},
+    eprint={2406.20087},
+    eprinttype = {arXiv},
+    year={2024}
+}
 """
+
+
+SUBMIT_CHALLENGE_TEXT = """
+## Steps to submit your progress alignment challenge
+
+To submit your progress alignment challenge, please follow the following steps.
+
+#### Step 1: Clone the ProgressGym codebase
+
+[More explanation coming]
+
+#### Step 2: Implement your progress alignment challenge as a `Judge` class
+
+[More explanation coming]
+
+#### Step 3 (optional but recommended): Run the benchmark script on all challenges
+
+[More explanation coming]
+
+#### Step 4: Submit the link to your codebase and (optional but recommended) the generated results as a JSON file
+
+[More explanation coming]
+"""
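
Step 4 of EVALUATION_QUEUE_TEXT asks for the JSON produced by run_benchmark.py. Judging from the validation in src/submission/submit.py (a dict keyed by challenge name, each value itself a dict of results), a submission file has roughly the shape sketched below. The inner keys are hypothetical placeholders; the real fields are whatever ProgressGym's run_benchmark.py emits and parse_challenge_result_dict expects.

# Rough shape of a submission JSON, inferred from the checks in src/submission/submit.py:
# a dict mapping challenge name -> dict of results. The inner "score" field is a
# hypothetical placeholder, not the confirmed ProgressGym output format.
import json

example_submission = {
    "Follow": {"score": 0.61},
    "Predict": {"score": 0.55},
    "Coevolve": {"score": 0.48},
}

with open("example_submission.json", "w") as f:
    json.dump(example_submission, f, indent=2)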
src/display/utils.py
CHANGED
@@ -8,6 +8,8 @@ from src.about import Tasks
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
+def fields_paired(raw_class):
+    return [(k,v) for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -25,23 +27,25 @@ auto_eval_column_dict = []
 
 # Init
 # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Algorithm", "markdown", True, never_hidden=True)])
 
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ↑", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 
-#
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
+# Algorithm information
+auto_eval_column_dict.append(["info", ColumnContent, ColumnContent("Info", "str", True)])
+auto_eval_column_dict.append(["update_timestamp", ColumnContent, ColumnContent("Update timestamp", "str", False)])
+# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Algorithm sha", "str", False, False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -58,18 +62,18 @@ class EvalQueueColumn: # Queue column
 
 ## All the model information that we might need
 @dataclass
-class
+class AlgoDetails:
     name: str
     display_name: str = ""
     symbol: str = "" # emoji
 
 
-class
-    PT =
-    FT =
-    IFT =
-    RL =
-    Unknown =
+class AlgoType(Enum):
+    PT = AlgoDetails(name="pretrained", symbol="🟢")
+    FT = AlgoDetails(name="fine-tuned", symbol="🔶")
+    IFT = AlgoDetails(name="instruction-tuned", symbol="⭕")
+    RL = AlgoDetails(name="RL-tuned", symbol="🟦")
+    Unknown = AlgoDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"
@@ -77,28 +81,28 @@ class ModelType(Enum):
     @staticmethod
     def from_str(type):
         if "fine-tuned" in type or "🔶" in type:
-            return
+            return AlgoType.FT
         if "pretrained" in type or "🟢" in type:
-            return
+            return AlgoType.PT
         if "RL-tuned" in type or "🟦" in type:
-            return
+            return AlgoType.RL
         if "instruction-tuned" in type or "⭕" in type:
-            return
-        return
+            return AlgoType.IFT
+        return AlgoType.Unknown
 
 class WeightType(Enum):
-    Adapter =
-    Original =
-    Delta =
+    Adapter = AlgoDetails("Adapter")
+    Original = AlgoDetails("Original")
+    Delta = AlgoDetails("Delta")
 
 class Precision(Enum):
-    float16 =
-    bfloat16 =
-    float32 =
-    #qt_8bit =
-    #qt_4bit =
-    #qt_GPTQ =
-    Unknown =
+    float16 = AlgoDetails("float16")
+    bfloat16 = AlgoDetails("bfloat16")
+    float32 = AlgoDetails("float32")
+    #qt_8bit = AlgoDetails("8bit")
+    #qt_4bit = AlgoDetails("4bit")
+    #qt_GPTQ = AlgoDetails("GPTQ")
+    Unknown = AlgoDetails("?")
 
     def from_str(precision):
         if precision in ["torch.float16", "float16"]:
@@ -124,7 +128,8 @@ TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default a
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS = [(t.value.col_name, t.value.benchmark) for t in Tasks]
+COLS_PAIRED = [(c.name, name) for name, c in fields_paired(AutoEvalColumn) if not c.hidden]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
@@ -135,4 +140,4 @@ NUMERIC_INTERVALS = {
     "~35": pd.Interval(20, 45, closed="right"),
     "~60": pd.Interval(45, 70, closed="right"),
     "70+": pd.Interval(70, 10000, closed="right"),
-}
+}
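
AutoEvalColumn is built dynamically with make_dataclass, so fields() returns the ColumnContent defaults while the new fields_paired() returns (attribute name, ColumnContent) pairs, which is exactly what COLS_PAIRED needs to map internal keys to display names. A self-contained sketch of that mechanism, with a simplified ColumnContent (the real definition lives in this file and may differ slightly):

# Self-contained sketch of the fields_paired()/make_dataclass pattern used above.
# ColumnContent here is a simplified stand-in for the one in src/display/utils.py.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def fields_paired(raw_class):
    # (attribute name, ColumnContent) pairs, skipping dunder attributes
    return [(k, v) for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

cols = [
    ["model", ColumnContent, ColumnContent("Algorithm", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Average ↑", "number", True)],
]
AutoEvalColumn = make_dataclass("AutoEvalColumn", cols, frozen=True)

COLS_PAIRED = [(c.name, name) for name, c in fields_paired(AutoEvalColumn) if not c.hidden]
print(COLS_PAIRED)  # [('Algorithm', 'model'), ('Average ↑', 'average')]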
src/envs.py
CHANGED
@@ -2,6 +2,8 @@ import os
 
 from huggingface_hub import HfApi
 
+import rsa
+
 # Info to change for your repository
 # ----------------------------------
 TOKEN = os.environ.get("TOKEN") # A read/write token for your org
@@ -24,3 +26,7 @@ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
 EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
 
 API = HfApi(token=TOKEN)
+
+with open('pubkey.pem', 'rb') as f:
+    pub = f.read()
+RSA_PUBKEY = rsa.PublicKey.load_pkcs1(pub)
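
envs.py only ever loads the public half of the key pair, so the Space can encrypt submitter emails but cannot read them back; decryption needs the private key, which the new "priv*" rule in .gitignore keeps out of the repository. A hedged sketch of the decryption side, assuming the maintainers hold a matching privkey.pem offline:

# Decryption-side sketch (not part of the Space). Assumes the maintainers hold the
# matching privkey.pem, which .gitignore's "priv*" rule keeps out of the repo.
import rsa

def decrypt_submitter_email(stored_hex: str, privkey_path: str = "privkey.pem") -> str:
    """Recover an email that add_new_eval()/add_new_challenge() stored as RSA-encrypted hex."""
    with open(privkey_path, "rb") as f:
        privkey = rsa.PrivateKey.load_pkcs1(f.read())
    return rsa.decrypt(bytes.fromhex(stored_hex), privkey).decode()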
src/leaderboard/read_evals.py
CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn,
+from src.display.utils import AutoEvalColumn, AlgoType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -23,7 +23,7 @@ class EvalResult:
     revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type:
+    model_type: AlgoType = AlgoType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
     architecture: str = "Unknown"
     license: str = "?"
@@ -91,39 +91,39 @@ class EvalResult:
             architecture=architecture
         )
 
-    def update_with_request_file(self, requests_path):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # def update_with_request_file(self, requests_path):
+    #     """Finds the relevant request file for the current model and updates info with it"""
+    #     request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
+
+    #     try:
+    #         with open(request_file, "r") as f:
+    #             request = json.load(f)
+    #         self.model_type = AlgoType.from_str(request.get("model_type", ""))
+    #         self.weight_type = WeightType[request.get("weight_type", "Original")]
+    #         self.license = request.get("license", "?")
+    #         self.likes = request.get("likes", 0)
+    #         self.num_params = request.get("params", 0)
+    #         self.date = request.get("submitted_time", "")
+    #     except Exception:
+    #         print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision.value.name,
-            AutoEvalColumn.model_type.name: self.model_type.value.name,
+            # AutoEvalColumn.precision.name: self.precision.value.name,
+            # AutoEvalColumn.model_type.name: self.model_type.value.name,
             # AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
-            AutoEvalColumn.architecture.name: self.architecture,
+            # AutoEvalColumn.weight_type.name: self.weight_type.value.name,
+            # AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
-            AutoEvalColumn.revision.name: self.revision,
+            # AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,
-            AutoEvalColumn.license.name: self.license,
-            AutoEvalColumn.likes.name: self.likes,
-            AutoEvalColumn.params.name: self.num_params,
-            AutoEvalColumn.still_on_hub.name: self.still_on_hub,
+            # AutoEvalColumn.license.name: self.license,
+            # AutoEvalColumn.likes.name: self.likes,
+            # AutoEvalColumn.params.name: self.num_params,
+            # AutoEvalColumn.still_on_hub.name: self.still_on_hub,
         }
 
         for task in Tasks:
@@ -181,7 +181,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+        # eval_result.update_with_request_file(requests_path)
         print(f"Found result for {eval_result.full_model} with precision {eval_result.precision.value.name}")
 
         # Store results of same eval together
src/populate.py
CHANGED
@@ -3,25 +3,64 @@ import os
 
 import pandas as pd
 
-from src.display.formatting import has_no_nan_values, make_clickable_model
+from src.display.formatting import has_no_nan_values, make_clickable_model, model_hyperlink
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
 
+def calc_average(row: pd.Series, benchmark_cols: list) -> float:
+    """Calculates the average of the benchmark columns that exist in the row"""
+    return row[[col for col in benchmark_cols if col in row]].mean()
 
-def get_leaderboard_df(results_path: str, requests_path: str, cols: list,
+def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols_paired: list, cols_paired: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
-
-    all_data_json = [
+    # raw_data = get_raw_eval_results(results_path, requests_path)
+    # all_data_json = [v.to_dict() for v in raw_data]
+    all_data_json = []
+    benchmark_cols = [col[0] for col in benchmark_cols_paired]
+    with open('./master_table.json') as f:
+        content = json.load(f)
+        for key, val in content.items():
+            val['eval_name'] = val['id']
+            del val['id']
+
+            if 'link' in val and val['link'].strip():
+                val['Algorithm'] = model_hyperlink(val['link'], val['name'])
+            else:
+                val['Algorithm'] = val['name']
+
+            del val['name']
+
+            # fill in the missing benchmark columns as 0
+            for display_name, benchmark in benchmark_cols_paired:
+                if benchmark not in val:
+                    val[display_name] = 0
+                else:
+                    val[display_name] = val[benchmark]
+                    del val[benchmark]
+
+            # change all the keys to the display names
+            for display_name, col in cols_paired:
+                if display_name in val:
+                    pass
+                elif col in val:
+                    val[display_name] = val[col]
+                    del val[col]
+                else:
+                    val[display_name] = None
+
+            all_data_json.append(val)
+
+    print(f'All data json: {all_data_json}')
 
     df = pd.DataFrame.from_records(all_data_json)
+    df[AutoEvalColumn.average.name] = df.apply(lambda row: calc_average(row, benchmark_cols), axis=1)
     print(df, AutoEvalColumn.average.name)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-    df = df[cols].round(decimals=
+    df = df[cols].round(decimals=4)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
-    return
+    return df
 
 
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
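
The leaderboard is now populated straight from master_table.json rather than from per-model result files: each entry is renamed to display-name keys, missing benchmarks are filled with 0, and the average is recomputed. A toy walk-through of that transformation; the entry and the paired-column lists are made up, and model_hyperlink is a simplified stand-in for src.display.formatting.model_hyperlink:

# Toy walk-through of the master_table.json -> leaderboard-row transformation above.
# The entry and the paired lists are illustrative; their shapes mirror BENCHMARK_COLS /
# COLS_PAIRED from src/display/utils.py ((display name, internal key) pairs).
import pandas as pd

def model_hyperlink(link, name):
    # simplified stand-in for src.display.formatting.model_hyperlink
    return f'<a href="{link}" target="_blank">{name}</a>'

entry = {"id": "algo-a_2024-07-01", "name": "algo-a", "link": "https://example.com", "Follow": 0.61, "Predict": 0.55}
benchmark_cols_paired = [("Follow ↑", "Follow"), ("Predict ↑", "Predict"), ("Coevolve ↑", "Coevolve")]

row = dict(entry)
row["eval_name"] = row.pop("id")
row["Algorithm"] = model_hyperlink(row["link"], row.pop("name"))
for display_name, benchmark in benchmark_cols_paired:
    row[display_name] = row.pop(benchmark, 0)   # a missing challenge counts as 0

df = pd.DataFrame.from_records([row])
df["Average ↑"] = df[[d for d, _ in benchmark_cols_paired]].mean(axis=1)
print(df[["Algorithm", "Follow ↑", "Predict ↑", "Coevolve ↑", "Average ↑"]].round(4))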
src/submission/submit.py
CHANGED
@@ -2,6 +2,9 @@ import json
 import os
 import re
 from datetime import datetime, timezone
+import rsa
+
+from src.envs import RSA_PUBKEY
 
 from src.challenges.result_parsers import parse_challenge_result_dict
 
@@ -30,14 +33,16 @@ def add_new_eval(
     algo_link: str,
     submitter_email: str,
 ):
-    return_str = 'Success! Your submission will
+    return_str = 'Success! Your submission will be added to the leaderboard within 24 hours.'
 
     # validate email and url
     if not parseaddr(submitter_email):
         return styled_error("Please enter a valid email address.")
 
+    submitter_email = rsa.encrypt(submitter_email.encode(), RSA_PUBKEY).hex()
+
     if algo_link.strip() and not urlparse(algo_link).scheme:
-        return styled_error("Please enter a valid URL.")
+        return styled_error("Please enter a valid URL (including the http/https protocol).")
 
     # get file path
     try:
@@ -91,6 +96,20 @@ def add_new_eval(
         "update_timestamp": timestamp_filename,
     }
 
+    # Upload the metadata file
+    print("Uploading metadata file")
+    metadata_filename = f'./tmp_metadata_{algo_name_filename}_{timestamp_filename}.json'
+    with open(metadata_filename, 'w') as f:
+        f.write(json.dumps(eval_entry))
+
+    API.upload_file(
+        path_or_fileobj=metadata_filename,
+        path_in_repo=f'upload_history/{algo_name_filename}/{timestamp_filename}_metadata.json',
+        repo_id=DATA_REPO,
+        repo_type="dataset",
+        commit_message=f"Add metadata {algo_name} by {submitter_email} at {timestamp_filename}",
+    )
+
     for challenge, result in results_per_challenge.items():
         try:
             parsed_result: float = parse_challenge_result_dict(challenge, result)
@@ -103,8 +122,8 @@ def add_new_eval(
     # Get content of the master table from DATA_REPO
     try:
         master_table = {}
-        if API.file_exists(DATA_REPO, "master_table.json"):
-            API.hf_hub_download(DATA_REPO, "master_table.json", EVAL_REQUESTS_PATH, force_download=True)
+        if API.file_exists(DATA_REPO, "master_table.json", repo_type='dataset'):
+            API.hf_hub_download(DATA_REPO, "master_table.json", local_dir=EVAL_REQUESTS_PATH, repo_type='dataset', force_download=True)
             with open(f"{EVAL_REQUESTS_PATH}/master_table.json", "r") as f:
                 master_table = json.load(f)
         else:
@@ -134,3 +153,82 @@ def add_new_eval(
     )
 
     return styled_message(return_str)
+
+
+
+def add_new_challenge(
+    submission_files,
+    challenge_name: str,
+    challenge_info: str,
+    challenge_link: str,
+    submitter_email: str,
+):
+    return_str = 'Success! We are working to incorporate your submitted challenge into the leaderboard, and will get back to you when we encounter problems.'
+
+    # validate email and url
+    if not parseaddr(submitter_email):
+        return styled_error("Please enter a valid email address.")
+
+    submitter_email = rsa.encrypt(submitter_email.encode(), RSA_PUBKEY).hex()
+
+    if challenge_link.strip() and not urlparse(challenge_link).scheme:
+        return styled_error("Please enter a valid URL (including the http/https protocol).")
+
+    # get file path
+    if submission_files is None:
+        submission_files = []
+    else:
+        try:
+            assert isinstance(submission_files, list)
+            assert all(isinstance(file, str) for file in submission_files)
+        except:
+            return styled_error("Invalid submission file: File path not found.")
+
+    # format the challenge name
+    challenge_name = challenge_name.strip()
+    challenge_name_filename = re.sub(r"[^a-zA-Z0-9]+", "-", challenge_name).lower()
+    timestamp_filename = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")
+
+    for num, file_path in enumerate(submission_files):
+        # parse the submission file
+        try:
+            with open(file_path, "r") as f:
+                submission_data = json.load(f)
+        except JSONDecodeError:
+            return styled_error(f"Invalid submission file {os.path.basename(file_path)}: JSON parsing failed.")
+
+        try:
+            assert isinstance(submission_data, dict)
+            assert all(isinstance(result, dict) for result in submission_data.values())
+        except (AssertionError, KeyError):
+            return styled_error(f"Invalid submission file {os.path.basename(file_path)}: Incorrect organization of the JSON file.")
+
+        print("Uploading submission file")
+        API.upload_file(
+            path_or_fileobj=file_path,
+            path_in_repo=f'upload_history/{challenge_name_filename}/{timestamp_filename}_file{num}_{os.path.basename(file_path)}.json',
+            repo_id=DATA_REPO,
+            repo_type="dataset",
+            commit_message=f"Add {challenge_name} to eval queue by {submitter_email} at {timestamp_filename}",
+        )
+
+    print("Uploading metadata file")
+    filename = f'./tmp_metadata_{challenge_name_filename}_{timestamp_filename}.json'
+    with open(filename, 'w') as f:
+        f.write(json.dumps({
+            "name": challenge_name,
+            "info": challenge_info,
+            "link": challenge_link,
+            "email": submitter_email,
+            "update_timestamp": timestamp_filename,
+        }))
+
+    API.upload_file(
+        path_or_fileobj=filename,
+        path_in_repo=f'upload_history/{challenge_name_filename}/{timestamp_filename}_metadata.json',
+        repo_id=DATA_REPO,
+        repo_type="dataset",
+        commit_message=f"Add metadata {challenge_name} by {submitter_email} at {timestamp_filename}",
+    )
+
+    return styled_message(return_str)
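
One practical constraint of the email encryption above: python-rsa's rsa.encrypt uses PKCS#1 v1.5 padding, which caps the plaintext at the key length in bytes minus 11 (e.g. 245 bytes for a 2048-bit key) and raises OverflowError for longer input; that is ample for an email address but worth keeping in mind. A small illustration using a throwaway toy key so it runs quickly; the Space itself loads its key from pubkey.pem:

# Illustration of the PKCS#1 v1.5 length limit that applies to rsa.encrypt() as used
# in add_new_eval()/add_new_challenge(). Uses a small throwaway key, not the Space's key.
import rsa

pub, priv = rsa.newkeys(512)        # 512-bit toy key; limit is 64 - 11 = 53 bytes
ciphertext_hex = rsa.encrypt(b"short@example.com", pub).hex()
print(len(ciphertext_hex))           # 128 hex characters = 64 ciphertext bytes

try:
    rsa.encrypt(b"x" * 100, pub)     # exceeds the 53-byte limit for this toy key
except OverflowError as err:
    print("message too long:", err)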
tempCodeRunnerFile.python
ADDED
@@ -0,0 +1,17 @@
+import rsa
+
+content = 'hello,world!@#$%^&*()_+'
+
+with open('pubkey.pem', mode='rb') as f:
+    pubkey = rsa.PublicKey.load_pkcs1(f.read())
+
+with open('privkey.pem', mode='rb') as f:
+    prikey = rsa.PrivateKey.load_pkcs1(f.read())
+
+crypto = rsa.encrypt(content.encode(), pubkey)
+print(crypto.hex())
+
+crypto = bytes.fromhex('24cb974ad7d5673bfc9e7bca3ad66d2365f52ef5cb6cdbc72f1bb5877cf358fcffc3ed34682b205d8b4a8982e979e9ef7f6f2250220f271fcbb2f733ee68a511acb49edbd437fc2798b7eaf7d890c72a7c5ce98254940fa1f40c44deecacd57b97576570f30a3704ba31949cd37e6ed5e594827e524d5a03341293cf767e38af2d8dbe089d370f12454e3290c6071d861940611855dbcd62f7b5dda5af2afb91c8d7c8545242d5bb9b88f49feb969f6d18ad6eea9947e8ced1a0e0aae4b025eb0225753dca9e796652def27bff7d77b1a5c97cd95e9bf638d2e33cd21b0bb0e9d3e3b0fc490b181ed6933ea555792f86ac0ea438479f4d66f1404d93924eeaff05f2d533ba710e11d7018ecda1eb1490c60e23a92d855a0de2f1d811382bf6b49bafd69c507b84d2e1ab7816fcfdc7393d4207ebd7260ec9711e9a180b3263e16e590f7eb15b163f4ccc55147530aa8ba6c34272060d8befc19703c1d1e199cbca80fa47869d8f80a1053d227abd519f9ce17bb09d676fda6ca5c3e547231df9aafba2a8ced35ace7b79087129b373535c3057b6042484ace1d03d982605ea268d6c454f5096b54c528f8373fb4b90d2b7cdb658d7cbd31035efddbbc2e443d0a106d450392d503cb3f2082b4607d070aa090649bc443fe54e7c95ed6059b3aa1a42154169a09c3481ee863c41cf6a7ee2ce05670b5a5cce9259a41669b4e647')
+
+content2 = rsa.decrypt(crypto, prikey).decode()
+print(content2, len(content2), len(content2.encode()))