File size: 3,649 Bytes
15c8167
 
 
26e855f
15c8167
460930f
15c8167
 
 
 
 
460930f
15c8167
 
 
 
8e404a5
15c8167
 
 
 
 
 
8e404a5
15c8167
 
 
9c39267
15c8167
 
8e404a5
 
c2c9efa
8e404a5
460930f
8e404a5
 
 
 
 
 
 
15c8167
 
 
 
 
8e404a5
 
15c8167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6099782
15c8167
 
 
 
26e855f
15c8167
 
 
 
 
 
9c39267
 
15c8167
 
 
 
bf6ab81
9c39267
 
15c8167
 
 
9c39267
15c8167
 
9c39267
 
 
 
 
 
 
 
 
 
15c8167
26e855f
 
 
a4b20f4
 
26e855f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import json

import gradio as gr
import numpy as np
import pandas as pd
from huggingface_hub import HfFileSystem

from src.constants import RESULTS_DATASET_ID, TASKS


def fetch_result_paths():
    """Return the paths of all JSON result files found in the results dataset.

    Returns:
        Whatever ``HfFileSystem.glob`` yields for the pattern
        ``<RESULTS_DATASET_ID>/**/**/*.json``.
    """
    pattern = f"{RESULTS_DATASET_ID}/**/**/*.json"
    return HfFileSystem().glob(pattern)


def sort_result_paths_per_model(paths):
    """Group result paths by model and sort the paths of each model.

    The model id is the part of a path that sits between the
    ``RESULTS_DATASET_ID`` prefix (plus one separator character) and the
    last ``/``, i.e. the path without dataset prefix and file name.

    Args:
        paths: Iterable of result file paths (strings).

    Returns:
        A dict mapping each model id to the sorted list of its paths, with
        model ids in order of first appearance.
    """
    grouped = {}
    for path in paths:
        relative = path[len(RESULTS_DATASET_ID) + 1:]
        # Unpacking (rather than indexing) keeps the failure on a path that
        # has no "/" left after the dataset prefix.
        model_id, _filename = relative.rsplit("/", 1)
        grouped.setdefault(model_id, []).append(path)
    return {model_id: sorted(model_paths) for model_id, model_paths in grouped.items()}


def update_load_results_component():
    """Return an enabled "Load" button, twice.

    Returns:
        A 2-tuple holding the same ``gr.Button`` object in both positions.
    """
    load_button = gr.Button("Load", interactive=True)
    return load_button, load_button


def load_results_dataframe(model_id, result_paths_per_model=None):
    """Load all result files of one model into a single-row DataFrame.

    Every file listed for ``model_id`` is read as JSON; its ``"results"`` and
    ``"configs"`` mappings are merged (later files override earlier ones on
    key clashes) and the merged data is flattened with
    ``pandas.json_normalize`` into dotted column names.

    Args:
        model_id: Key into ``result_paths_per_model``.
        result_paths_per_model: Mapping of model id to a list of result file
            paths.

    Returns:
        A one-row DataFrame whose ``"index"`` column holds the model name, or
        ``None`` when ``model_id`` or ``result_paths_per_model`` is empty or
        when the model has no result files.

    Raises:
        KeyError: If ``model_id`` is not a key of ``result_paths_per_model``,
            or a result file lacks ``"results"`` / ``"configs"``.
    """
    if not model_id or not result_paths_per_model:
        return None
    result_paths = result_paths_per_model[model_id]
    if not result_paths:
        # Without any file the loop below would never bind ``model_name``;
        # bail out before creating a filesystem client for nothing.
        return None
    fs = HfFileSystem()
    data = {"results": {}, "configs": {}}
    model_name = "Model"
    for path in result_paths:
        with fs.open(path, "r") as f:
            d = json.load(f)
        data["results"].update(d["results"])
        data["configs"].update(d["configs"])
        # The name reported by the last file wins.
        model_name = d.get("model_name", "Model")
    df = pd.json_normalize([data])
    # reset_index() moves the (unnamed) model-name index into an "index" column.
    return df.set_index(pd.Index([model_name])).reset_index()


def load_results_dataframes(*model_ids, result_paths_per_model=None):
    """Load the results of several models, one entry per model id.

    Args:
        *model_ids: Model ids, each passed on to ``load_results_dataframe``.
        result_paths_per_model: Forwarded unchanged to every call.

    Returns:
        A list with one ``load_results_dataframe`` result per model id, in
        the order the ids were given.
    """
    dataframes = []
    for model_id in model_ids:
        dataframe = load_results_dataframe(model_id, result_paths_per_model=result_paths_per_model)
        dataframes.append(dataframe)
    return dataframes


def display_results(task, *dfs):
    """Render the "results" and "configs" views for the given model frames.

    Frames that are ``None`` or that have no ``"index"`` column are ignored.
    The remaining frames are re-indexed on ``"index"``, concatenated and
    transposed, so each original column becomes a row and each model a
    column.

    Args:
        task: Task selector passed through to ``display_tab``.
        *dfs: DataFrames (or ``None``) to compare.

    Returns:
        A ``(results_html, configs_html)`` tuple, or ``(None, None)`` when no
        usable frame was supplied.
    """
    # A missing frame (None) has no ``columns`` attribute; skip it instead of
    # failing on the membership test.
    frames = [df.set_index("index") for df in dfs if df is not None and "index" in df.columns]
    if not frames:
        return None, None
    # After transposing, drop the name carried over onto the columns axis.
    combined = pd.concat(frames).T.rename_axis(columns=None)
    return display_tab("results", combined, task), display_tab("configs", combined, task)


def display_tab(tab, df, task):
    """Render the rows of ``df`` that belong to one tab and task as HTML.

    Args:
        tab: Row-label prefix to show (the callers visible in this file pass
            "results" or "configs").
        df: DataFrame whose index holds dotted string labels.
        task: "All", or a task name used as a further row-label prefix.

    Returns:
        The HTML produced by the pandas Styler.
    """
    show_all = task == "All"

    def is_hidden(row):
        # Rows outside this tab, rows under "<tab>.leaderboard." and alias
        # rows are never shown.
        if not row.startswith(f"{tab}."):
            return True
        if row.startswith(f"{tab}.leaderboard."):
            return True
        if row.endswith(".alias"):
            return True
        if show_all:
            # In the "All" view only the arc_challenge rows are dropped.
            return row.startswith(f"{tab}.leaderboard_arc_challenge")
        return not row.startswith(f"{tab}.{task}")

    styler = df.style.format(na_rep="")
    styler.hide([row for row in styler.index if is_hidden(row)], axis="index")
    styler.apply(highlight_min_max, axis=1)

    if show_all:
        prefix_length = len(f"{tab}.leaderboard_")
    else:
        # The trailing space makes the cut one character longer than the
        # "<tab>.<task>" prefix, so the character right after the task name
        # is stripped from the label as well.
        prefix_length = len(f"{tab}.{task} ")
    styler.format_index(lambda idx: idx[prefix_length:].removesuffix(",none"), axis="index")
    return styler.to_html()


def update_tasks_component():
    """Return a visible "Tasks" radio selector, twice.

    The choices are "All" followed by the values of ``TASKS``; "All" is
    preselected.

    Returns:
        A 2-tuple holding the same ``gr.Radio`` object in both positions.
    """
    choices = ["All", *TASKS.values()]
    tasks_radio = gr.Radio(
        choices,
        label="Tasks",
        info="Evaluation tasks to be displayed",
        value="All",
        visible=True,
    )
    return tasks_radio, tasks_radio


def clear_results():
    """Return the values that reset the results view.

    Returns:
        An 8-tuple: four ``None`` values, a disabled "Load" button (the same
        object twice) and a hidden "Tasks" radio (the same object twice).
    """
    disabled_load_button = gr.Button("Load", interactive=False)
    hidden_tasks_radio = gr.Radio(
        ["All", *TASKS.values()],
        label="Tasks",
        info="Evaluation tasks to be displayed",
        value="All",
        visible=False,
    )
    # Output order:
    # model_id_1, model_id_2, dataframe_1, dataframe_2, load_results_btn, load_configs_btn, results_task, configs_task
    return (
        None,
        None,
        None,
        None,
        disabled_load_button,
        disabled_load_button,
        hidden_tasks_radio,
        hidden_tasks_radio,
    )


def highlight_min_max(s):
    """Return per-cell CSS that marks the best value of a score row.

    Only rows whose name ends in "acc,none", "acc_norm,none" or
    "exact_match,none" are coloured: cells equal to the row maximum (NaN
    ignored when computing it) get a green background, every other cell,
    NaN cells included, gets a red one.

    Args:
        s: A pandas Series with a string ``name``.

    Returns:
        An array of CSS strings for a score row, otherwise a list of empty
        strings; either way one entry per cell.
    """
    score_suffixes = ("acc,none", "acc_norm,none", "exact_match,none")
    if not s.name.endswith(score_suffixes):
        return [""] * len(s)
    best = np.nanmax(s.values)
    return np.where(s == best, "background-color:green", "background-color:red")