Add Solbench score
- src/display/utils.py  +2 -1
- src/leaderboard/read_evals.py  +7 -2
- src/populate.py  +2 -1
src/display/utils.py
CHANGED
@@ -38,7 +38,8 @@ auto_eval_column_dict = [
         "", "str", True, never_hidden=True)),
     ("model", ColumnContent, create_column_content(
         "Model", "markdown", True, never_hidden=True)),
-    ("average", ColumnContent, create_column_content("Average", "number", True)),
+    ("solbench", ColumnContent, create_column_content("Score", "number", True)),
+    # ("average", ColumnContent, create_column_content("Average", "number", True)),
 ]
 
 # Add task-specific columns
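For context, the tuples above are the fields of a dynamically built dataclass, following the usual Hugging Face leaderboard template. The sketch below shows one plausible shape of that machinery; the field layout of ColumnContent and the body of create_column_content are assumptions inferred from the call sites in this diff, not code from this repository.

# Minimal sketch of the column machinery this diff extends, assuming the
# standard leaderboard template. ColumnContent's fields and the helper body
# are inferred from the call sites above, not copied from the repository.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def create_column_content(name, col_type, displayed_by_default, never_hidden=False):
    # Frozen (hashable) instances are valid dataclass defaults, so the
    # generated class exposes them as plain class attributes.
    return ColumnContent(name, col_type, displayed_by_default, never_hidden=never_hidden)

auto_eval_column_dict = [
    ("model", ColumnContent, create_column_content("Model", "markdown", True, never_hidden=True)),
    ("solbench", ColumnContent, create_column_content("Score", "number", True)),
]

# The template turns the tuple list into the AutoEvalColumn accessor
# referenced by read_evals.py and populate.py below.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
print(AutoEvalColumn.solbench.name)  # -> "Score"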
src/leaderboard/read_evals.py
CHANGED
@@ -114,7 +114,11 @@ class EvalResult:
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
-        average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        # average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
+        solbench = sum([
+            self.results.get('naive_judge', 0),
+            self.results.get('human_eval_solidity', 0)
+        ]) / 2
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
             AutoEvalColumn.precision.name: self.precision.value.name,
@@ -124,7 +128,8 @@ class EvalResult:
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.model_name),
             AutoEvalColumn.revision.name: self.revision,
-            AutoEvalColumn.average.name: average,
+            # AutoEvalColumn.average.name: average,
+            AutoEvalColumn.solbench.name: solbench,
             AutoEvalColumn.license.name: self.license,
             AutoEvalColumn.likes.name: self.likes,
             AutoEvalColumn.params.name: self.num_params,
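One consequence of the aggregation above worth noting: results.get(key, 0) treats a missing benchmark as a literal zero, so a model evaluated on only one of the two tasks is averaged against 0 rather than skipped. A standalone check, with hypothetical score values:

# Standalone check of the solbench aggregation from to_dict(); the score
# values are hypothetical, chosen only to show the missing-benchmark case.
def solbench_score(results: dict) -> float:
    return sum([
        results.get('naive_judge', 0),
        results.get('human_eval_solidity', 0)
    ]) / 2

print(solbench_score({'naive_judge': 0.8, 'human_eval_solidity': 0.6}))  # 0.7
print(solbench_score({'naive_judge': 0.8}))  # 0.4 -- missing task counts as 0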
src/populate.py
CHANGED
@@ -19,7 +19,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
-    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df.sort_values(by=[AutoEvalColumn.solbench.name], ascending=False)
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
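A toy run of the re-sorted frame, assuming only that AutoEvalColumn.solbench.name resolves to the "Score" header defined in utils.py; the records are invented for illustration:

# Toy reproduction of the new sort in get_leaderboard_df(); the record values
# are made up, and "Score" stands in for AutoEvalColumn.solbench.name.
import pandas as pd

all_data_json = [
    {"Model": "model-a", "Score": 0.41},
    {"Model": "model-b", "Score": 0.73},
]
df = pd.DataFrame.from_records(all_data_json)
df = df.sort_values(by=["Score"], ascending=False)  # solbench replaces the old average sort key
print(df.round(decimals=2))  # model-b (0.73) now ranks above model-a (0.41)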