Spaces:
Runtime error
Runtime error
pminervini
committed on
Commit
·
d01d881
1
Parent(s):
e1f29ca
update
Browse files
- app.py +2 -2
- beta-cli.py +3 -3
- src/display/utils.py +7 -6
- src/leaderboard/read_evals.py +4 -0
app.py
CHANGED
@@ -28,8 +28,8 @@ from src.display.utils import (
|
|
28 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
|
29 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
30 |
from src.submission.submit import add_new_eval
|
31 |
-
from src.submission.check_validity import already_submitted_models
|
32 |
-
from src.tools.collections import update_collections
|
33 |
from src.tools.plots import (
|
34 |
create_metric_plot_obj,
|
35 |
create_plot_df,
|
|
|
28 |
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
|
29 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
30 |
from src.submission.submit import add_new_eval
|
31 |
+
# from src.submission.check_validity import already_submitted_models
|
32 |
+
# from src.tools.collections import update_collections
|
33 |
from src.tools.plots import (
|
34 |
create_metric_plot_obj,
|
35 |
create_plot_df,
|
beta-cli.py
CHANGED
@@ -10,7 +10,7 @@ snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="
|
|
10 |
raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
|
11 |
|
12 |
for entry in raw_data:
|
13 |
-
if '
|
14 |
-
|
15 |
|
16 |
-
# print(raw_data)
|
|
|
10 |
raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
|
11 |
|
12 |
for entry in raw_data:
|
13 |
+
# if '125m' in entry.eval_name:
|
14 |
+
print(entry)
|
15 |
|
16 |
+
# print(raw_data)
|
src/display/utils.py
CHANGED
@@ -18,9 +18,10 @@ class Tasks(Enum):
|
|
18 |
hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
|
19 |
mmlu = Task("hendrycksTest", "acc", "MMLU")
|
20 |
truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
|
21 |
-
winogrande = Task("winogrande", "acc", "Winogrande")
|
22 |
-
gsm8k = Task("gsm8k", "acc", "GSM8K")
|
23 |
drop = Task("drop", "f1", "DROP")
|
|
|
24 |
|
25 |
# These classes are for user facing column names,
|
26 |
# to avoid having to change them all around the code
|
@@ -77,8 +78,8 @@ baseline_row = {
|
|
77 |
AutoEvalColumn.hellaswag.name: 25.0,
|
78 |
AutoEvalColumn.mmlu.name: 25.0,
|
79 |
AutoEvalColumn.truthfulqa.name: 25.0,
|
80 |
-
AutoEvalColumn.winogrande.name: 50.0,
|
81 |
-
AutoEvalColumn.gsm8k.name: 0.21,
|
82 |
AutoEvalColumn.drop.name: 0.47,
|
83 |
AutoEvalColumn.dummy.name: "baseline",
|
84 |
AutoEvalColumn.model_type.name: "",
|
@@ -102,8 +103,8 @@ human_baseline_row = {
|
|
102 |
AutoEvalColumn.hellaswag.name: 95.0,
|
103 |
AutoEvalColumn.mmlu.name: 89.8,
|
104 |
AutoEvalColumn.truthfulqa.name: 94.0,
|
105 |
-
AutoEvalColumn.winogrande.name: 94.0,
|
106 |
-
AutoEvalColumn.gsm8k.name: 100,
|
107 |
AutoEvalColumn.drop.name: 96.42,
|
108 |
AutoEvalColumn.dummy.name: "human_baseline",
|
109 |
AutoEvalColumn.model_type.name: "",
|
|
|
18 |
hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
|
19 |
mmlu = Task("hendrycksTest", "acc", "MMLU")
|
20 |
truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
|
21 |
+
# winogrande = Task("winogrande", "acc", "Winogrande")
|
22 |
+
# gsm8k = Task("gsm8k", "acc", "GSM8K")
|
23 |
drop = Task("drop", "f1", "DROP")
|
24 |
+
nqopen = Task("nq_open", "em", "NQ Open")
|
25 |
|
26 |
# These classes are for user facing column names,
|
27 |
# to avoid having to change them all around the code
|
|
|
78 |
AutoEvalColumn.hellaswag.name: 25.0,
|
79 |
AutoEvalColumn.mmlu.name: 25.0,
|
80 |
AutoEvalColumn.truthfulqa.name: 25.0,
|
81 |
+
# AutoEvalColumn.winogrande.name: 50.0,
|
82 |
+
# AutoEvalColumn.gsm8k.name: 0.21,
|
83 |
AutoEvalColumn.drop.name: 0.47,
|
84 |
AutoEvalColumn.dummy.name: "baseline",
|
85 |
AutoEvalColumn.model_type.name: "",
|
|
|
103 |
AutoEvalColumn.hellaswag.name: 95.0,
|
104 |
AutoEvalColumn.mmlu.name: 89.8,
|
105 |
AutoEvalColumn.truthfulqa.name: 94.0,
|
106 |
+
# AutoEvalColumn.winogrande.name: 94.0,
|
107 |
+
# AutoEvalColumn.gsm8k.name: 100,
|
108 |
AutoEvalColumn.drop.name: 96.42,
|
109 |
AutoEvalColumn.dummy.name: "human_baseline",
|
110 |
AutoEvalColumn.model_type.name: "",
|
src/leaderboard/read_evals.py
CHANGED
@@ -96,6 +96,10 @@ class EvalResult:
|
|
96 |
mean_acc = np.mean(accs) * 100.0
|
97 |
results[task.benchmark] = mean_acc
|
98 |
|
|
|
|
|
|
|
|
|
99 |
return self(
|
100 |
eval_name=result_key,
|
101 |
full_model=full_model,
|
|
|
96 |
mean_acc = np.mean(accs) * 100.0
|
97 |
results[task.benchmark] = mean_acc
|
98 |
|
99 |
+
# XXX
|
100 |
+
if 'nq_open' not in results:
|
101 |
+
results['nq_open'] = 0.0
|
102 |
+
|
103 |
return self(
|
104 |
eval_name=result_key,
|
105 |
full_model=full_model,
|