more ref, ci, debug
- app.py (+41, -40)
- src/display/utils.py (+3, -2)
- src/leaderboard/read_evals.py (+3, -2)
- src/populate.py (+7, -0)
app.py
CHANGED
@@ -60,6 +60,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 def init_leaderboard(dataframe):
     # if dataframe is None or dataframe.empty:
     #     raise ValueError("Leaderboard DataFrame is empty or None.")
+    # print(dataframe.columns)
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -102,45 +103,45 @@ with demo:
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
+            # with gr.Column():
+            #     with gr.Row():
+            #         gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+            #     with gr.Column():
+            #         with gr.Accordion(
+            #             f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+            #             open=False,
+            #         ):
+            #             with gr.Row():
+            #                 finished_eval_table = gr.components.Dataframe(
+            #                     value=finished_eval_queue_df,
+            #                     headers=EVAL_COLS,
+            #                     datatype=EVAL_TYPES,
+            #                     row_count=5,
+            #                 )
+            #         with gr.Accordion(
+            #             f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+            #             open=False,
+            #         ):
+            #             with gr.Row():
+            #                 running_eval_table = gr.components.Dataframe(
+            #                     value=running_eval_queue_df,
+            #                     headers=EVAL_COLS,
+            #                     datatype=EVAL_TYPES,
+            #                     row_count=5,
+            #                 )
+
+            #         with gr.Accordion(
+            #             f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+            #             open=False,
+            #         ):
+            #             with gr.Row():
+            #                 pending_eval_table = gr.components.Dataframe(
+            #                     value=pending_eval_queue_df,
+            #                     headers=EVAL_COLS,
+            #                     datatype=EVAL_TYPES,
+            #                     row_count=5,
+            #                 )
             with gr.Row():
                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
 
@@ -171,7 +172,7 @@ with demo:
                         value="Original",
                         interactive=True,
                     )
-                    base_model_name_textbox = gr.Textbox(label="
+                    base_model_name_textbox = gr.Textbox(label="Организация")
                     ans_file = gr.File(label="Arena Hard Answer File", file_types=["json","jsonl"])
 
             submit_button = gr.Button("Submit Eval")
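Note: the hunk above only declares the submission widgets. Below is a minimal sketch of how they would typically be wired in Gradio; the handle_submission handler and the submission_result output are illustrative assumptions, not code from this commit.

import gradio as gr

# Hypothetical handler: a real Space would validate and persist the uploaded answer file.
def handle_submission(org_name, answer_file):
    return f"Received submission from {org_name!r}: {answer_file}"

with gr.Blocks() as demo:
    base_model_name_textbox = gr.Textbox(label="Организация")
    ans_file = gr.File(label="Arena Hard Answer File", file_types=["json", "jsonl"])
    submit_button = gr.Button("Submit Eval")
    submission_result = gr.Markdown()  # assumed output slot, not present in the diff
    submit_button.click(
        handle_submission,
        inputs=[base_model_name_textbox, ans_file],
        outputs=submission_result,
    )

if __name__ == "__main__":
    demo.launch()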
src/display/utils.py
CHANGED
@@ -32,8 +32,9 @@ for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
 # auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
+auto_eval_column_dict.append(["CI", ColumnContent, ColumnContent("95% CI", "string", True)])
+auto_eval_column_dict.append(["lower", ColumnContent, ColumnContent("lower", "number", False)])
+auto_eval_column_dict.append(["upper", ColumnContent, ColumnContent("upper", "number", False)])
 auto_eval_column_dict.append(["avg_tokens", ColumnContent, ColumnContent("avg_tokens", "number", True)])
 auto_eval_column_dict.append(["std_tokens", ColumnContent, ColumnContent("std_tokens", "number", True)])
 auto_eval_column_dict.append(["lc_score", ColumnContent, ColumnContent("lc_score", "number", True)])
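The new CI/lower/upper entries follow the column-registration pattern of the stock leaderboard template, where auto_eval_column_dict is turned into a frozen dataclass. A standalone sketch of that pattern, assuming a simplified ColumnContent (the real definition lives elsewhere in src/display/utils.py):

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str                   # header shown in the leaderboard UI
    type: str                   # column datatype ("number", "string", ...)
    displayed_by_default: bool
    hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["CI", ColumnContent, ColumnContent("95% CI", "string", True)])
auto_eval_column_dict.append(["lower", ColumnContent, ColumnContent("lower", "number", False)])
auto_eval_column_dict.append(["upper", ColumnContent, ColumnContent("upper", "number", False)])

# Each [attr_name, type, default] triple becomes a field of a generated dataclass,
# so other modules can refer to AutoEvalColumn.CI.name, AutoEvalColumn.lower.type, etc.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.CI.name)                     # 95% CI
print(AutoEvalColumn.lower.displayed_by_default)  # False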
src/leaderboard/read_evals.py
CHANGED
@@ -167,9 +167,10 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
         try:
             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
         except dateutil.parser._parser.ParserError:
-            files = [files[-1]]
+            files = [sorted(files)[-1]]
 
-        for file in files:
+
+        for file in [files[-1]]:
             model_result_filepaths.append(os.path.join(root, file))
 
     # eval_results = {}
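For reference, the sort key above strips the results_ prefix and .json suffix and then drops the last 7 characters, which lines files up by timestamp if they follow the template's results_<ISO timestamp>.json naming. A tiny illustration with a made-up filename:

# Hypothetical filename; the [:-7] slice removes the 7-character microsecond suffix.
name = "results_2024-06-12T10-30-45.123456.json"
key = name.removesuffix(".json").removeprefix("results_")[:-7]
print(key)  # 2024-06-12T10-30-45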
src/populate.py
CHANGED
@@ -16,11 +16,18 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     # print(raw_data)
     df = pd.DataFrame.from_records(json.load(open(raw_data[0])))
     print(list(df.columns))
+    df['95% CI'] = " "
     # df['model']="nothing"
     # df.columns = cols
     # df.iloc[0]= create dummy
     # print(dir(AutoEvalColumn))
     df = df.sort_values(by=[AutoEvalColumn.task0.name], ascending=False)
+    decimal = 1
+    for i,row in df.iterrows():
+        if 'lower' not in row:
+            continue
+        interval = '+'+str(round(row['upper'] - row['score'], decimal))+' / '+str(round(row['lower'] - row['score'], decimal))
+        df.at[i,'95% CI'] = interval
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
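The loop added to get_leaderboard_df formats a 95% CI string from the upper/lower bounds relative to the score. A self-contained sketch of that formatting on invented numbers (only the column names come from the diff):

import pandas as pd

# Made-up row; only the "score"/"lower"/"upper" column names come from the diff.
df = pd.DataFrame([{"score": 50.0, "lower": 48.2, "upper": 51.9}])

df["95% CI"] = " "
decimal = 1
for i, row in df.iterrows():
    if "lower" not in row:
        continue
    # "+<upper - score> / <lower - score>", rounded to one decimal place
    interval = "+" + str(round(row["upper"] - row["score"], decimal)) + " / " + str(round(row["lower"] - row["score"], decimal))
    df.at[i, "95% CI"] = interval

print(df["95% CI"].iloc[0])  # +1.9 / -1.8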