Added result screener + UI format

Files changed:
- app.py +14 -14
- src/about.py +1 -1
- src/display/utils.py +18 -11
- src/leaderboard/read_evals.py +25 -25
- src/populate.py +16 -9
- src/submission/submit.py +10 -8
app.py CHANGED

@@ -68,21 +68,21 @@ def init_leaderboard(dataframe):
             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
             label="Select Columns to Display:",
         ),
-        search_columns=[AutoEvalColumn.model.name
+        search_columns=[AutoEvalColumn.model.name],
         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
         filter_columns=[
-            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-            ColumnFilter(
-                AutoEvalColumn.params.name,
-                type="slider",
-                min=0.01,
-                max=150,
-                label="Select the number of parameters (B)",
-            ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
+            # ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
+            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
+            # ColumnFilter(
+            #     AutoEvalColumn.params.name,
+            #     type="slider",
+            #     min=0.01,
+            #     max=150,
+            #     label="Select the number of parameters (B)",
+            # ),
+            # ColumnFilter(
+            #     AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
+            # ),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,

@@ -172,7 +172,7 @@ with demo:
                         interactive=True,
                     )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-                    ans_file = gr.File(label="Arena Hard Answer File", file_types=["
+                    ans_file = gr.File(label="Arena Hard Answer File", file_types=["json","jsonl"])

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
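For reference, a minimal, hedged sketch of the submission widgets this hunk touches: a gr.File upload restricted to JSON/JSONL next to the submit button. Only ans_file, submit_button, and submission_result mirror names from the diff; echo_submission and model_name_textbox are illustrative stand-ins for the Space's add_new_eval() wiring.

```python
# Hedged sketch, not the Space's code: a JSON/JSONL answer-file upload feeding a
# submit callback in a Gradio Blocks app.
import gradio as gr


def echo_submission(model_name, ans_file):
    # gr.File hands the callback a file path (newer Gradio) or a tempfile-like
    # object (older Gradio); handle both defensively.
    path = ans_file if isinstance(ans_file, str) else getattr(ans_file, "name", None)
    if not path:
        return "No answer file uploaded."
    return f"Would submit `{model_name}` with answers from `{path}`."


with gr.Blocks() as demo:
    model_name_textbox = gr.Textbox(label="Model name")
    ans_file = gr.File(label="Arena Hard Answer File", file_types=["json", "jsonl"])
    submit_button = gr.Button("Submit Eval")
    submission_result = gr.Markdown()
    submit_button.click(echo_submission, [model_name_textbox, ans_file], submission_result)

if __name__ == "__main__":
    demo.launch()
```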
src/about.py CHANGED

@@ -13,7 +13,7 @@ class Task:
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("arenahard", "score", "score")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    # task1 = Task("logiqa", "acc_norm", "LogiQA")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
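A minimal sketch of the Task plumbing this file feeds; the Task dataclass fields are assumed from the standard leaderboard template rather than shown in this diff. With task1 commented out, arenahard/score is the only benchmark column generated downstream.

```python
# Hedged reconstruction of the template's Task/Tasks structure (assumed, not from this diff).
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # task key in the results json
    metric: str     # metric key inside that task
    col_name: str   # column title shown on the leaderboard


class Tasks(Enum):
    task0 = Task("arenahard", "score", "score")
    # task1 = Task("logiqa", "acc_norm", "LogiQA")  # disabled by this commit


# Downstream code iterates the enum to build one leaderboard column per task.
for task in Tasks:
    print(task.name, task.value.benchmark, task.value.col_name)
```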
src/display/utils.py CHANGED

@@ -22,23 +22,29 @@ class ColumnContent:

 ## Leaderboard columns
 auto_eval_column_dict = []
+# ['results', 'model', 'score', 'lower', 'upper', 'avg_tokens', 'std_tokens', 'lc_score']
 # Init
 # auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["lower", ColumnContent, ColumnContent("lower", "number", True)])
+auto_eval_column_dict.append(["upper", ColumnContent, ColumnContent("upper", "number", True)])
+auto_eval_column_dict.append(["avg_tokens", ColumnContent, ColumnContent("avg_tokens", "number", True)])
+auto_eval_column_dict.append(["std_tokens", ColumnContent, ColumnContent("std_tokens", "number", True)])
+auto_eval_column_dict.append(["lc_score", ColumnContent, ColumnContent("lc_score", "number", True)])
+# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

@@ -52,6 +58,7 @@ class EvalQueueColumn: # Queue column
     precision = ColumnContent("precision", "str", True)
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
+    answers_file = ColumnContent("answers_file", "str", True)

 ## All the model information that we might need
 @dataclass
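The column list above is turned into the AutoEvalColumn dataclass by make_dataclass, so the new lower/upper/avg_tokens/std_tokens/lc_score entries become ordinary fields. A self-contained sketch of that mechanism; the ColumnContent definition here is a trimmed-down assumption of the template's class, and only a few columns are reproduced.

```python
# Hedged sketch of how auto_eval_column_dict becomes AutoEvalColumn.
from dataclasses import dataclass, fields, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


auto_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("model", "markdown", True, never_hidden=True)],
    ["lower", ColumnContent, ColumnContent("lower", "number", True)],
    ["upper", ColumnContent, ColumnContent("upper", "number", True)],
    ["avg_tokens", ColumnContent, ColumnContent("avg_tokens", "number", True)],
    ["std_tokens", ColumnContent, ColumnContent("std_tokens", "number", True)],
    ["lc_score", ColumnContent, ColumnContent("lc_score", "number", True)],
]

# Same call as in the diff: each [name, type, default] entry becomes a frozen field.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print([c.name for c in fields(AutoEvalColumn)])                     # field names used by app.py
print(AutoEvalColumn.lc_score.name, AutoEvalColumn.lc_score.type)   # "lc_score" "number"
```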
src/leaderboard/read_evals.py CHANGED

@@ -18,14 +18,14 @@ class EvalResult:
     """
     eval_name: str # org_model_precision (uid)
     full_model: str # org/model (path on hub)
-    org: str
+    org: str
     model: str
     revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
     weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0

@@ -85,7 +85,7 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
+            precision=precision,
             revision= config.get("model_sha", ""),
             still_on_hub=still_on_hub,
             architecture=architecture

@@ -172,25 +172,25 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))

-    eval_results = {}
-    for model_result_filepath in model_result_filepaths:
-        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
-
-        # Store results of same eval together
-        eval_name = eval_result.eval_name
-        if eval_name in eval_results.keys():
-            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
-        else:
-            eval_results[eval_name] = eval_result
-
-    results = []
-    for v in eval_results.values():
-        try:
-            v.to_dict() # we test if the dict version is complete
-            results.append(v)
-        except KeyError: # not all eval values present
-            continue
-
-    return results
+    # eval_results = {}
+    # for model_result_filepath in model_result_filepaths:
+    #     # Creation of result
+    #     eval_result = EvalResult.init_from_json_file(model_result_filepath)
+    #     eval_result.update_with_request_file(requests_path)
+
+    #     # Store results of same eval together
+    #     eval_name = eval_result.eval_name
+    #     if eval_name in eval_results.keys():
+    #         eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
+    #     else:
+    #         eval_results[eval_name] = eval_result
+
+    # results = []
+    # for v in eval_results.values():
+    #     try:
+    #         v.to_dict() # we test if the dict version is complete
+    #         results.append(v)
+    #     except KeyError: # not all eval values present
+    #         continue
+
+    return model_result_filepaths#results
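With the parsing body commented out, get_raw_eval_results() now just returns the collected file paths and leaves the loading to populate.py. A hedged, stand-alone equivalent of that behaviour; the .json filter and the sorting are assumptions, not in the diff.

```python
# Hedged sketch of the rewritten get_raw_eval_results() behaviour: walk the
# results directory and return the JSON file paths instead of EvalResult objects.
import os


def collect_result_files(results_path: str) -> list[str]:
    model_result_filepaths: list[str] = []
    for root, _dirs, files in os.walk(results_path):
        for file in files:
            if file.endswith(".json"):
                model_result_filepaths.append(os.path.join(root, file))
    return sorted(model_result_filepaths)


if __name__ == "__main__":
    # "./eval-results" is a placeholder for the Space's results_path.
    print(collect_result_files("./eval-results"))
```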
src/populate.py CHANGED

@@ -6,19 +6,22 @@ import pandas as pd
 from src.display.formatting import has_no_nan_values, make_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.envs import RESULTS_REPO


 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
-    all_data_json = [v.to_dict() for v in raw_data]
-
-    df = pd.DataFrame.from_records(
-    df
+    # all_data_json = [v.to_dict() for v in raw_data]
+    # print(raw_data)
+    df = pd.DataFrame.from_records(json.load(open(raw_data[0])))
+    print(list(df.columns))
+    # df['model']="nothing"
     # df.columns = cols
     # df.iloc[0]= create dummy
-    #
-
+    # print(dir(AutoEvalColumn))
+    df = df.sort_values(by=[AutoEvalColumn.task0.name], ascending=False)
+    df = df[cols].round(decimals=2)

     # filter out if any of the benchmarks have not been produced
     # df = df[has_no_nan_values(df, benchmark_cols)]

@@ -27,25 +30,29 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:

 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
-    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+    entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".") and not entry.endswith(".jsonl")]
     all_evals = []

     for entry in entries:
-        if ".json" in entry:
+        if ".json" in entry and 'toeval' not in entry:
             file_path = os.path.join(save_path, entry)
             with open(file_path) as fp:
+                print(file_path)
                 data = json.load(fp)

             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")

             all_evals.append(data)
-        elif ".md" not in entry:
+        elif ".md" not in entry and 'toeval' not in entry and 'results' not in entry:
             # this is a folder
             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
             for sub_entry in sub_entries:
+                if 'toeval' in sub_entry:
+                    continue
                 file_path = os.path.join(save_path, entry, sub_entry)
                 with open(file_path) as fp:
+                    # print(file_path)
                     data = json.load(fp)

                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
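get_leaderboard_df() now reads the first file returned by get_raw_eval_results() straight into pandas, sorts by the score column, and rounds. A self-contained sketch under the assumption that the results file is a JSON list of records with the columns noted in src/display/utils.py; the sample values below are made up.

```python
# Hedged sketch of the new get_leaderboard_df() path, using fabricated sample records.
import json

import pandas as pd

sample_records = [
    {"model": "model-a", "score": 52.314, "lower": 50.1, "upper": 54.6,
     "avg_tokens": 512.0, "std_tokens": 88.0, "lc_score": 49.872},
    {"model": "model-b", "score": 61.905, "lower": 59.4, "upper": 64.2,
     "avg_tokens": 430.0, "std_tokens": 75.0, "lc_score": 60.117},
]

# Stand-in for the first path returned by get_raw_eval_results().
with open("results.json", "w") as f:
    json.dump(sample_records, f)

cols = ["model", "score", "lower", "upper", "avg_tokens", "std_tokens", "lc_score"]
df = pd.DataFrame.from_records(json.load(open("results.json")))
df = df.sort_values(by="score", ascending=False)   # "score" plays the role of AutoEvalColumn.task0.name
df = df[cols].round(decimals=2)
print(df)
```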
src/submission/submit.py CHANGED

@@ -16,7 +16,7 @@ USERS_TO_SUBMISSION_DATES = None

 def add_new_eval(
     model: str,
-
+    user_name: str,
     revision: str,
     precision: str,
     weight_type: str,

@@ -28,7 +28,7 @@ def add_new_eval(
     if not REQUESTED_MODELS:
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

-    user_name =
+    user_name = user_name
     model_path = model
     if "/" in model:
         user_name = model.split("/")[0]

@@ -75,10 +75,15 @@ def add_new_eval(

     # Seems good, creating the eval
     print("Adding new eval")
+    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
+    os.makedirs(OUT_DIR, exist_ok=True)
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+    out_path_upload = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}_toeval.json"
+

     eval_entry = {
         "model": model,
-        "
+        "user_name": user_name,
         "revision": revision,
         "precision": precision,
         "weight_type": weight_type,

@@ -87,8 +92,9 @@ def add_new_eval(
         "model_type": model_type,
         "likes": "",
         "params": "",
-        "license":
+        "license": "",
         "private": False,
+        "answers_file": str(out_path_upload),
     }

     # Check for duplicate submission

@@ -96,10 +102,6 @@ def add_new_eval(
         return styled_warning("This model has been already submitted.")

     print("Creating eval file")
-    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
-    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
-    out_path_upload = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}_toeval.json"

     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))