comparator / src /results.py
albertvillanova's picture
Add checkbox in Configs to show only differences
f12aa56 verified
raw
history blame
4.44 kB
import asyncio
import gradio as gr
import numpy as np
import pandas as pd
from huggingface_hub import HfFileSystem
import src.constants as constants
from src.hub import load_file
def fetch_result_paths():
fs = HfFileSystem()
paths = fs.glob(f"{constants.RESULTS_DATASET_ID}/**/**/*.json")
return paths
def sort_result_paths_per_model(paths):
from collections import defaultdict
d = defaultdict(list)
for path in paths:
model_id, _ = path[len(constants.RESULTS_DATASET_ID) + 1 :].rsplit("/", 1)
d[model_id].append(path)
return {model_id: sorted(paths) for model_id, paths in d.items()}
def update_load_results_component():
return (gr.Button("Load", interactive=True),) * 2
async def load_results_dataframe(model_id, result_paths_per_model=None):
if not model_id or not result_paths_per_model:
return
result_paths = result_paths_per_model[model_id]
results = await asyncio.gather(*[load_file(path) for path in result_paths])
data = {"results": {}, "configs": {}}
for result in results:
data["results"].update(result["results"])
data["configs"].update(result["configs"])
model_name = result.get("model_name", "Model")
df = pd.json_normalize([data])
# df.columns = df.columns.str.split(".") # .split return a list instead of a tuple
return df.set_index(pd.Index([model_name])).reset_index()
async def load_results_dataframes(*model_ids, result_paths_per_model=None):
result = await asyncio.gather(
*[load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids]
)
return result
def display_results(task, hide_errors, show_only_differences, *dfs):
dfs = [df.set_index("index") for df in dfs if "index" in df.columns]
if not dfs:
return None, None
df = pd.concat(dfs)
df = df.T.rename_axis(columns=None)
return (
display_tab("results", df, task, hide_errors=hide_errors),
display_tab("configs", df, task, show_only_differences=show_only_differences),
)
def display_tab(tab, df, task, hide_errors=True, show_only_differences=False):
if show_only_differences:
any_difference = df.ne(df.iloc[:, 0], axis=0).any(axis=1)
df = df.style.format(escape="html", na_rep="")
df.hide(
[
row
for row in df.index
if (
not row.startswith(f"{tab}.")
or row.startswith(f"{tab}.leaderboard.")
or row.endswith(".alias")
or (
not row.startswith(f"{tab}.{task}")
if task != "All"
else row.startswith(f"{tab}.leaderboard_arc_challenge")
)
# Hide errors
or (hide_errors and row.endswith("_stderr,none"))
# Hide non-different rows
or (show_only_differences and not any_difference[row])
)
],
axis="index",
)
df.apply(highlight_min_max, axis=1)
start = len(f"{tab}.leaderboard_") if task == "All" else len(f"{tab}.{task} ")
df.format_index(lambda idx: idx[start:].removesuffix(",none"), axis="index")
return df.to_html()
def update_tasks_component():
return (
gr.Radio(
["All"] + list(constants.TASKS.values()),
label="Tasks",
info="Evaluation tasks to be displayed",
value="All",
visible=True,
),
) * 2
def clear_results():
# model_id_1, model_id_2, dataframe_1, dataframe_2, load_results_btn, load_configs_btn, results_task, configs_task
return (
None,
None,
None,
None,
*(gr.Button("Load", interactive=False),) * 2,
*(
gr.Radio(
["All"] + list(constants.TASKS.values()),
label="Tasks",
info="Evaluation tasks to be displayed",
value="All",
visible=False,
),
)
* 2,
)
def highlight_min_max(s):
if s.name.endswith("acc,none") or s.name.endswith("acc_norm,none") or s.name.endswith("exact_match,none"):
return np.where(s == np.nanmax(s.values), "background-color:green", "background-color:#D81B60")
else:
return [""] * len(s)
def display_loading_message_for_results():
return ("<h3 style='text-align: center;'>Loading...</h3>",) * 2