File size: 4,597 Bytes
d0f55c6 15c8167 30a0c61 7e32ac7 15c8167 7e32ac7 15c8167 8e404a5 15c8167 611a3ed 15c8167 8e404a5 15c8167 611a3ed 15c8167 d0f55c6 8e404a5 c2c9efa 8e404a5 fae0e19 8e404a5 d0f55c6 da4a3b1 15c8167 d0f55c6 611a3ed d0f55c6 15c8167 585c3fa 15c8167 54e105e 585c3fa f12aa56 54e105e 15c8167 585c3fa f12aa56 bd64e7a 581682a 15c8167 611a3ed 585c3fa f12aa56 15c8167 581682a 41fbe9f 581682a 15c8167 9c39267 30a0c61 15c8167 bf6ab81 9c39267 15c8167 9c39267 15c8167 611a3ed 9c39267 30a0c61 9c39267 611a3ed 15c8167 26e855f 8f7c83f 611a3ed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import asyncio
import gradio as gr
import pandas as pd
import src.constants as constants
from src.hub import glob, load_json_file
def fetch_result_paths():
    """Return whatever ``glob`` yields for the results-dataset JSON pattern.

    The pattern targets ``*.json`` files two directory levels below
    ``constants.RESULTS_DATASET_ID``.
    """
    return glob(f"{constants.RESULTS_DATASET_ID}/**/**/*.json")
def sort_result_paths_per_model(paths):
    """Group result paths by model id and sort each group.

    The model id is the part of a path that lies between the
    ``constants.RESULTS_DATASET_ID`` prefix (plus its trailing slash) and the
    final ``/``-separated component.

    Args:
        paths: Iterable of result file paths.

    Returns:
        Dict mapping each model id to its sorted list of paths, in order of
        first appearance of the model id.
    """
    # Skip the dataset id and the "/" that follows it.
    prefix_length = len(constants.RESULTS_DATASET_ID) + 1
    grouped = {}
    for path in paths:
        model_id, _file_name = path[prefix_length:].rsplit("/", 1)
        grouped.setdefault(model_id, []).append(path)
    return {model_id: sorted(model_paths) for model_id, model_paths in grouped.items()}
def update_load_results_component():
    """Return an interactive "Load" button twice, as a 2-tuple."""
    load_button = gr.Button("Load", interactive=True)
    return load_button, load_button
async def load_results_dataframe(model_id, result_paths_per_model=None):
    """Load every result file of one model into a single-row DataFrame.

    The "results" and "configs" sections of all files are merged (on key
    collisions the later file wins), flattened by ``pd.json_normalize`` and
    labelled with the model name.

    Args:
        model_id: Key into ``result_paths_per_model``.
        result_paths_per_model: Mapping from model id to the list of result
            file paths of that model.

    Returns:
        A one-row DataFrame whose "index" column holds the model name, or
        ``None`` when no model id or mapping is given, the model id is not in
        the mapping, or the model has no result files.
    """
    if not model_id or not result_paths_per_model:
        return None
    # An unknown model or an empty path list means there is nothing to load;
    # treat it like "nothing selected" instead of failing with KeyError or
    # with an unbound ``model_name`` further down.
    result_paths = result_paths_per_model.get(model_id)
    if not result_paths:
        return None
    results = await asyncio.gather(*(load_json_file(path) for path in result_paths))
    data = {"results": {}, "configs": {}}
    for result in results:
        data["results"].update(result["results"])
        data["configs"].update(result["configs"])
    # The name reported by the last loaded file labels the row.
    model_name = results[-1].get("model_name", "Model")
    df = pd.json_normalize([data])
    return df.set_index(pd.Index([model_name])).reset_index()
async def load_results_dataframes(*model_ids, result_paths_per_model=None):
    """Load the results DataFrame of several models concurrently.

    Args:
        *model_ids: Model ids, each forwarded to ``load_results_dataframe``.
        result_paths_per_model: Mapping forwarded unchanged to every call.

    Returns:
        List with one entry per model id, in the order given.
    """
    pending = (
        load_results_dataframe(model_id, result_paths_per_model)
        for model_id in model_ids
    )
    return await asyncio.gather(*pending)
def display_results(task, hide_std_errors, show_only_differences, *dfs):
    """Render the "results" and "configs" tabs for the given model frames.

    Frames without an "index" column are ignored. The remaining ones are
    indexed by that column, concatenated and transposed before being passed
    to ``display_tab`` once per tab.

    Args:
        task: Task selector forwarded to ``display_tab``.
        hide_std_errors: Forwarded to the "results" tab.
        show_only_differences: Forwarded to the "configs" tab.
        *dfs: One DataFrame per model.

    Returns:
        Tuple ``(results_output, configs_output)``; ``(None, None)`` when no
        frame has an "index" column.
    """
    indexed_frames = []
    for frame in dfs:
        if "index" in frame.columns:
            indexed_frames.append(frame.set_index("index"))
    if len(indexed_frames) == 0:
        return None, None
    # Transpose so each model becomes a column; drop the leftover axis name.
    combined = pd.concat(indexed_frames).T.rename_axis(columns=None)
    results_output = display_tab("results", combined, task, hide_std_errors=hide_std_errors)
    configs_output = display_tab("configs", combined, task, show_only_differences=show_only_differences)
    return results_output, configs_output
def display_tab(tab, df, task, hide_std_errors=True, show_only_differences=False):
if show_only_differences:
any_difference = df.ne(df.iloc[:, 0], axis=0).any(axis=1)
df = df.style.format(escape="html", na_rep="")
# Hide rows
df.hide(
[
row
for row in df.index
if (
not row.startswith(f"{tab}.")
or row.startswith(f"{tab}.leaderboard.")
or row.endswith(".alias")
or (
not row.startswith(f"{tab}.{task}")
if task != "All"
else row.startswith(f"{tab}.leaderboard_arc_challenge")
)
# Hide std errors
or (hide_std_errors and row.endswith("_stderr,none"))
# Hide non-different rows
or (show_only_differences and not any_difference[row])
)
],
axis="index",
)
# Color metric result cells
idx = pd.IndexSlice
colored_rows = idx[
[
row
for row in df.index
if row.endswith("acc,none") or row.endswith("acc_norm,none") or row.endswith("exact_match,none")
]
] # Apply only on numeric cells, otherwise the background gradient will not work
subset = idx[colored_rows, idx[:]]
df.background_gradient(cmap="PiYG", vmin=0, vmax=1, subset=subset, axis=None)
# Format index values: remove prefix and suffix
start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
return df.to_html()
def update_tasks_component():
    """Return a visible "Tasks" radio twice, as a 2-tuple.

    The choices are "All" followed by the values of ``constants.TASKS``, with
    "All" preselected.
    """
    choices = ["All", *constants.TASKS.values()]
    tasks_radio = gr.Radio(
        choices,
        label="Tasks",
        info="Evaluation tasks to be displayed",
        value="All",
        visible=True,
    )
    return tasks_radio, tasks_radio
def clear_results():
    """Return the reset values for the results view as an 8-tuple.

    Output order: model_id_1, model_id_2, dataframe_1, dataframe_2,
    load_results_btn, load_configs_btn, results_task, configs_task.
    The first four are ``None``, both buttons are a non-interactive "Load"
    button and both task selectors are a hidden "Tasks" radio.
    """
    disabled_load_button = gr.Button("Load", interactive=False)
    hidden_tasks_radio = gr.Radio(
        ["All", *constants.TASKS.values()],
        label="Tasks",
        info="Evaluation tasks to be displayed",
        value="All",
        visible=False,
    )
    return (
        None,
        None,
        None,
        None,
        disabled_load_button,
        disabled_load_button,
        hidden_tasks_radio,
        hidden_tasks_radio,
    )
def display_loading_message_for_results():
    """Return the centered "Loading..." HTML heading twice, as a 2-tuple."""
    loading_html = "<h3 style='text-align: center;'>Loading...</h3>"
    return loading_html, loading_html
|