File size: 11,831 Bytes
e2e6875 25557b5 05c90f4 6679087 25557b5 7379857 38f4369 ddc25db 7379857 38f4369 25557b5 c660995 25557b5 3caeacd 05c90f4 c660995 25557b5 05c90f4 023a289 3caeacd 05c90f4 6679087 e29ab28 6679087 99aea78 7e19f96 81f1dd1 0d4c659 6679087 0d4c659 6679087 0d4c659 6679087 0d4c659 6679087 a0691fa 0d4c659 81f1dd1 6679087 3782698 5b4c5f8 25557b5 54202cb 3caeacd 7379857 c8b695a 7379857 c8b695a 7379857 99aea78 eec78c0 7379857 eec78c0 7379857 bd858f5 e611814 023a289 e611814 023a289 6679087 e611814 023a289 6679087 e611814 2436603 6679087 05c90f4 3caeacd 54202cb 3caeacd 7379857 3caeacd 7379857 3caeacd 7379857 c8b695a bd858f5 7379857 25557b5 3caeacd 99aea78 5b4c5f8 3caeacd 5b4c5f8 ddc25db 8f68cc2 6679087 7e19f96 6679087 7379857 54202cb 7379857 3caeacd c8b695a 7379857 99aea78 bd858f5 7379857 0d84f54 7379857 eec78c0 7379857 e611814 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 |
import io
import json
import gradio as gr
import pandas as pd
from huggingface_hub import HfFileSystem
RESULTS_DATASET_ID = "datasets/open-llm-leaderboard/results"
EXCLUDED_KEYS = {
"pretty_env_info",
"chat_template",
"group_subtasks",
}
# EXCLUDED_RESULTS_KEYS = {
# "leaderboard",
# }
# EXCLUDED_RESULTS_LEADERBOARDS_KEYS = {
# "alias",
# }
DETAILS_DATASET_ID = "datasets/open-llm-leaderboard/{model_name_sanitized}-details"
DETAILS_FILENAME = "samples_{subtask}_*.json"
TASKS = {
"leaderboard_arc_challenge": ("ARC", "leaderboard_arc_challenge"),
"leaderboard_bbh": ("BBH", "leaderboard_bbh"),
"leaderboard_gpqa": ("GPQA", "leaderboard_gpqa"),
"leaderboard_ifeval": ("IFEval", "leaderboard_ifeval"),
"leaderboard_math_hard": ("MATH", "leaderboard_math"),
"leaderboard_mmlu_pro": ("MMLU-Pro", "leaderboard_mmlu_pro"),
"leaderboard_musr": ("MuSR", "leaderboard_musr"),
}
SUBTASKS = {
"leaderboard_arc_challenge": ["leaderboard_arc_challenge"],
"leaderboard_bbh": [
"leaderboard_bbh_boolean_expressions",
"leaderboard_bbh_causal_judgement",
"leaderboard_bbh_date_understanding",
"leaderboard_bbh_disambiguation_qa",
"leaderboard_bbh_formal_fallacies",
"leaderboard_bbh_geometric_shapes",
"leaderboard_bbh_hyperbaton",
"leaderboard_bbh_logical_deduction_five_objects",
"leaderboard_bbh_logical_deduction_seven_objects",
"leaderboard_bbh_logical_deduction_three_objects",
"leaderboard_bbh_movie_recommendation",
"leaderboard_bbh_navigate",
"leaderboard_bbh_object_counting",
"leaderboard_bbh_penguins_in_a_table",
"leaderboard_bbh_reasoning_about_colored_objects",
"leaderboard_bbh_ruin_names",
"leaderboard_bbh_salient_translation_error_detection",
"leaderboard_bbh_snarks", "leaderboard_bbh_sports_understanding",
"leaderboard_bbh_temporal_sequences",
"leaderboard_bbh_tracking_shuffled_objects_five_objects",
"leaderboard_bbh_tracking_shuffled_objects_seven_objects",
"leaderboard_bbh_tracking_shuffled_objects_three_objects",
"leaderboard_bbh_web_of_lies",
],
"leaderboard_gpqa": [
"leaderboard_gpqa_extended",
"leaderboard_gpqa_diamond",
"leaderboard_gpqa_main",
],
"leaderboard_ifeval": ["leaderboard_ifeval"],
# "leaderboard_math_hard": [
"leaderboard_math": [
"leaderboard_math_algebra_hard",
"leaderboard_math_counting_and_prob_hard",
"leaderboard_math_geometry_hard",
"leaderboard_math_intermediate_algebra_hard",
"leaderboard_math_num_theory_hard",
"leaderboard_math_prealgebra_hard",
"leaderboard_math_precalculus_hard",
],
"leaderboard_mmlu_pro": ["leaderboard_mmlu_pro"],
"leaderboard_musr": [
"leaderboard_musr_murder_mysteries",
"leaderboard_musr_object_placements",
"leaderboard_musr_team_allocation",
],
}
fs = HfFileSystem()
def fetch_result_paths():
paths = fs.glob(f"{RESULTS_DATASET_ID}/**/**/*.json")
return paths
def filter_latest_result_path_per_model(paths):
from collections import defaultdict
d = defaultdict(list)
for path in paths:
model_id, _ = path[len(RESULTS_DATASET_ID) +1:].rsplit("/", 1)
d[model_id].append(path)
return {model_id: max(paths) for model_id, paths in d.items()}
def get_result_path_from_model(model_id, result_path_per_model):
return result_path_per_model[model_id]
def update_load_results_component():
return gr.Button("Load Results", interactive=True)
def load_data(result_path) -> pd.DataFrame:
with fs.open(result_path, "r") as f:
data = json.load(f)
return data
def load_results_dataframe(model_id):
if not model_id:
return
result_path = get_result_path_from_model(model_id, latest_result_path_per_model)
data = load_data(result_path)
model_name = data.get("model_name", "Model")
df = pd.json_normalize([{key: value for key, value in data.items() if key not in EXCLUDED_KEYS}])
# df.columns = df.columns.str.split(".") # .split return a list instead of a tuple
return df.set_index(pd.Index([model_name])).reset_index()
def load_results_dataframes(*model_ids):
return [load_results_dataframe(model_id) for model_id in model_ids]
def display_results(task, *dfs):
dfs = [df.set_index("index") for df in dfs if "index" in df.columns]
if not dfs:
return None, None
df = pd.concat(dfs)
df = df.T.rename_axis(columns=None)
return display_tab("results", df, task), display_tab("configs", df, task)
def display_tab(tab, df, task):
df = df.style.format(na_rep="")
df.hide(
[
row
for row in df.index
if (
not row.startswith(f"{tab}.")
or row.startswith(f"{tab}.leaderboard.")
or row.endswith(".alias")
or (not row.startswith(f"{tab}.{task}") if task != "All" else False)
)
],
axis="index",
)
start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
return df.to_html()
def update_tasks_component():
return gr.Radio(
["All"] + list(TASKS.values()),
label="Tasks",
info="Evaluation tasks to be displayed",
value="All",
interactive=True,
)
def clear_results():
# model_id_1, model_id_2, dataframe_1, dataframe_2, task
return (
None, None, None, None,
gr.Radio(
["All"] + list(TASKS.values()),
label="Tasks",
info="Evaluation tasks to be displayed",
value="All",
interactive=False,
),
)
def update_subtasks_component(task):
return gr.Radio(
SUBTASKS.get(task),
info="Evaluation subtasks to be displayed",
value=None,
)
def update_load_details_component(model_id_1, model_id_2, subtask):
if (model_id_1 or model_id_2) and subtask:
return gr.Button("Load Details", interactive=True)
else:
return gr.Button("Load Details", interactive=False)
def load_details_dataframe(model_id, subtask):
if not model_id or not subtask:
return
model_name_sanitized = model_id.replace("/", "__")
paths = fs.glob(
f"{DETAILS_DATASET_ID}/**/{DETAILS_FILENAME}".format(
model_name_sanitized=model_name_sanitized, subtask=subtask
)
)
if not paths:
return
path = max(paths)
with fs.open(path, "r") as f:
data = [json.loads(line) for line in f]
df = pd.json_normalize(data)
# df = df.rename_axis("Parameters", axis="columns")
df["model_name"] = model_id # Keep model_name
return df
# return df.set_index(pd.Index([model_id])).reset_index()
def load_details_dataframes(subtask, *model_ids):
return [load_details_dataframe(model_id, subtask) for model_id in model_ids]
def display_details(sample_idx, *dfs):
rows = [df.iloc[sample_idx] for df in dfs if "model_name" in df.columns and sample_idx < len(df)]
if not rows:
return
# Pop model_name and add it to the column name
df = pd.concat([row.rename(row.pop("model_name")) for row in rows], axis="columns")
return (
df.style
.format(na_rep="")
# .hide(axis="index")
.to_html()
)
def update_sample_idx_component(*dfs):
maximum = max([len(df) - 1 for df in dfs])
return gr.Number(
label="Sample Index",
info="Index of the sample to be displayed",
value=0,
minimum=0,
maximum=maximum,
visible=True,
)
# if __name__ == "__main__":
latest_result_path_per_model = filter_latest_result_path_per_model(fetch_result_paths())
with gr.Blocks(fill_height=True) as demo:
gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
gr.HTML("<h3 style='text-align: center;'>Select 2 models to load and compare their results</h3>")
with gr.Row():
with gr.Column():
model_id_1 = gr.Dropdown(choices=list(latest_result_path_per_model.keys()), label="Models")
dataframe_1 = gr.Dataframe(visible=False)
with gr.Column():
model_id_2 = gr.Dropdown(choices=list(latest_result_path_per_model.keys()), label="Models")
dataframe_2 = gr.Dataframe(visible=False)
with gr.Row():
# with gr.Tab("All"):
# pass
with gr.Tab("Results"):
task = gr.Radio(
["All"] + list(TASKS.values()),
label="Tasks",
info="Evaluation tasks to be displayed",
value="All",
interactive=False,
)
load_results_btn = gr.Button("Load Results", interactive=False)
clear_results_btn = gr.Button("Clear Results")
with gr.Tab("Results"):
results = gr.HTML()
with gr.Tab("Configs"):
configs = gr.HTML()
with gr.Tab("Details"):
details_task = gr.Radio(
["All"] + list(TASKS.values()),
label="Tasks",
info="Evaluation tasks to be displayed",
value="All",
interactive=True,
)
subtask = gr.Radio(
SUBTASKS.get(details_task.value),
label="Subtasks",
info="Evaluation subtasks to be displayed (choose one of the Tasks above)",
)
load_details_btn = gr.Button("Load Details", interactive=False)
sample_idx = gr.Number(
label="Sample Index",
info="Index of the sample to be displayed",
value=0,
minimum=0,
visible=False
)
details = gr.HTML()
details_dataframe_1 = gr.Dataframe(visible=False)
details_dataframe_2 = gr.Dataframe(visible=False)
details_dataframe = gr.DataFrame(visible=False)
model_id_1.change(
fn=update_load_results_component,
outputs=load_results_btn,
)
load_results_btn.click(
fn=load_results_dataframes,
inputs=[model_id_1, model_id_2],
outputs=[dataframe_1, dataframe_2],
).then(
fn=update_tasks_component,
outputs=task,
)
gr.on(
triggers=[dataframe_1.change, dataframe_2.change, task.change],
fn=display_results,
inputs=[task, dataframe_1, dataframe_2],
outputs=[results, configs],
)
clear_results_btn.click(
fn=clear_results,
outputs=[model_id_1, model_id_2, dataframe_1, dataframe_2, task],
)
details_task.change(
fn=update_subtasks_component,
inputs=details_task,
outputs=subtask,
)
gr.on(
triggers=[model_id_1.change, model_id_2.change, subtask.change, details_task.change],
fn=update_load_details_component,
inputs=[model_id_1, model_id_2, subtask],
outputs=load_details_btn,
)
load_details_btn.click(
fn=load_details_dataframes,
inputs=[subtask, model_id_1, model_id_2],
outputs=[details_dataframe_1, details_dataframe_2],
).then(
fn=update_sample_idx_component,
inputs=[details_dataframe_1, details_dataframe_2],
outputs=sample_idx,
)
gr.on(
triggers=[details_dataframe_1.change, details_dataframe_2.change, sample_idx.change],
fn=display_details,
inputs=[sample_idx, details_dataframe_1, details_dataframe_2],
outputs=details,
)
demo.launch()
|