File size: 4,436 Bytes
d0f55c6
15c8167
 
26e855f
15c8167
460930f
15c8167
30a0c61
d0f55c6
15c8167
 
 
460930f
30a0c61
15c8167
 
 
8e404a5
15c8167
 
 
 
611a3ed
15c8167
8e404a5
15c8167
 
 
611a3ed
15c8167
 
d0f55c6
8e404a5
c2c9efa
8e404a5
d0f55c6
8e404a5
d0f55c6
 
 
 
da4a3b1
15c8167
 
 
 
d0f55c6
611a3ed
 
 
d0f55c6
15c8167
 
f12aa56
15c8167
 
 
 
 
54e105e
 
f12aa56
54e105e
15c8167
 
f12aa56
 
 
bd64e7a
15c8167
 
 
 
 
 
 
 
611a3ed
 
 
 
 
54e105e
 
f12aa56
 
15c8167
 
 
 
26e855f
15c8167
 
 
 
 
 
9c39267
 
30a0c61
15c8167
 
 
bf6ab81
9c39267
 
15c8167
 
 
9c39267
15c8167
611a3ed
 
 
 
 
9c39267
 
30a0c61
9c39267
 
 
 
 
611a3ed
 
15c8167
26e855f
 
 
a4b20f4
33d0dfb
26e855f
 
8f7c83f
 
 
611a3ed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import asyncio

import gradio as gr
import numpy as np
import pandas as pd
from huggingface_hub import HfFileSystem

import src.constants as constants
from src.hub import load_file


def fetch_result_paths():
    """List the JSON result files of the results dataset on the Hugging Face Hub.

    Returns:
        Whatever ``HfFileSystem.glob`` yields for the pattern
        ``<constants.RESULTS_DATASET_ID>/**/**/*.json``.
    """
    return HfFileSystem().glob(f"{constants.RESULTS_DATASET_ID}/**/**/*.json")


def sort_result_paths_per_model(paths):
    """Group result file paths by model and sort the paths of each model.

    The model id of a path is what is left after dropping the leading
    ``constants.RESULTS_DATASET_ID`` prefix (plus the one character that follows it)
    and the last ``/``-separated component.

    Args:
        paths: Iterable of result file path strings.

    Returns:
        Dict mapping each model id to the sorted list of its paths, with the keys
        in order of first appearance in ``paths``.
    """
    grouped = {}
    for result_path in paths:
        relative_path = result_path[len(constants.RESULTS_DATASET_ID) + 1 :]
        # Two-way unpacking is deliberate: a path without a "/" after the prefix raises.
        model_id, _filename = relative_path.rsplit("/", 1)
        grouped.setdefault(model_id, []).append(result_path)
    return {model_id: sorted(model_paths) for model_id, model_paths in grouped.items()}


def update_load_results_component():
    """Return the enabled "Load" button twice, one entry per output component.

    Returns:
        A 2-tuple holding the same interactive ``gr.Button`` instance in both slots.
    """
    load_button = gr.Button("Load", interactive=True)
    return load_button, load_button


async def load_results_dataframe(model_id, result_paths_per_model=None):
    """Load every result file of one model and merge them into a one-row DataFrame.

    Args:
        model_id: Key into ``result_paths_per_model`` identifying the model to load.
        result_paths_per_model: Mapping of model id to the list of its result file paths.

    Returns:
        ``None`` when ``model_id`` or ``result_paths_per_model`` is falsy. Otherwise a
        one-row DataFrame whose ``index`` column holds the model name and whose other
        columns are the flattened, dot-separated keys of the merged "results" and
        "configs" sections.

    Raises:
        KeyError: If ``model_id`` is not in ``result_paths_per_model``, or a loaded
            result has no "results" or "configs" key.
    """
    if not model_id or not result_paths_per_model:
        return None
    result_paths = result_paths_per_model[model_id]
    # Load all files concurrently; gather returns the results in the order of result_paths.
    results = await asyncio.gather(*[load_file(path) for path in result_paths])
    data = {"results": {}, "configs": {}}
    # Bind the fallback label before the loop so an empty list of result files
    # does not leave model_name undefined at the return statement.
    model_name = "Model"
    for result in results:
        # Later files overwrite same-named entries from earlier ones.
        data["results"].update(result["results"])
        data["configs"].update(result["configs"])
        # The name reported by the last file wins.
        model_name = result.get("model_name", "Model")
    # Column names stay plain dotted strings (e.g. "results.<task>.<metric>").
    df = pd.json_normalize([data])
    # Label the single row with the model name, then expose it as the "index" column.
    return df.set_index(pd.Index([model_name])).reset_index()


async def load_results_dataframes(*model_ids, result_paths_per_model=None):
    """Load the results DataFrame of several models concurrently.

    Args:
        *model_ids: Model ids, each forwarded to ``load_results_dataframe``.
        result_paths_per_model: Mapping of model id to result file paths, forwarded
            unchanged with every model id.

    Returns:
        List with one ``load_results_dataframe`` result per model id, in the order
        the ids were given.
    """
    pending_loads = (load_results_dataframe(model_id, result_paths_per_model) for model_id in model_ids)
    return await asyncio.gather(*pending_loads)


def display_results(task, hide_errors, show_only_differences, *dfs):
    """Build the HTML of the "results" and "configs" tabs for the loaded models.

    Args:
        task: Task selector forwarded to ``display_tab``.
        hide_errors: Forwarded to ``display_tab`` for the "results" tab.
        show_only_differences: Forwarded to ``display_tab`` for the "configs" tab.
        *dfs: Per-model DataFrames; those without an ``index`` column are skipped.

    Returns:
        ``(None, None)`` when no DataFrame has an ``index`` column, otherwise the
        pair ``(results_html, configs_html)``.
    """
    indexed_frames = []
    for frame in dfs:
        if "index" in frame.columns:
            indexed_frames.append(frame.set_index("index"))
    if len(indexed_frames) == 0:
        return None, None
    # Stack the models as rows, then transpose: one row per key, one column per model.
    combined = pd.concat(indexed_frames).T.rename_axis(columns=None)
    results_html = display_tab("results", combined, task, hide_errors=hide_errors)
    configs_html = display_tab("configs", combined, task, show_only_differences=show_only_differences)
    return results_html, configs_html


def display_tab(tab, df, task, hide_errors=True, show_only_differences=False):
    """Render one tab of the comparison table as an HTML string.

    Args:
        tab: Row-label prefix of the section to show; callers in this file pass
            "results" or "configs".
        df: DataFrame with string row labels (flattened dotted keys) and one column
            per model.
        task: "All", or the task prefix that visible rows must start with after ``tab``.
        hide_errors: When true, hide rows whose label ends with "_stderr,none".
        show_only_differences: When true, hide rows in which no column differs from
            the first column.

    Returns:
        The HTML produced by the pandas Styler after hiding, highlighting and
        shortening the row labels.
    """
    if show_only_differences:
        # Per row: does any column hold a value different from the first column?
        any_difference = df.ne(df.iloc[:, 0], axis=0).any(axis=1)

    def is_hidden(row):
        # Rows belonging to another section.
        if not row.startswith(f"{tab}."):
            return True
        if row.startswith(f"{tab}.leaderboard.") or row.endswith(".alias"):
            return True
        if task != "All":
            # A specific task is selected: keep only its rows.
            if not row.startswith(f"{tab}.{task}"):
                return True
        elif row.startswith(f"{tab}.leaderboard_arc_challenge"):
            return True
        # Hide errors
        if hide_errors and row.endswith("_stderr,none"):
            return True
        # Hide non-different rows
        return bool(show_only_differences and not any_difference[row])

    def shorten(label):
        return label[start:].removesuffix(",none")

    styler = df.style.format(escape="html", na_rep="")
    rows_to_hide = [row for row in df.index if is_hidden(row)]
    styler.hide(rows_to_hide, axis="index")
    styler.apply(highlight_min_max, axis=1)
    if task == "All":
        start = len(f"{tab}.leaderboard_")
    else:
        # The trailing space presumably stands in for the one separator character
        # that follows the task name in the row label.
        start = len(f"{tab}.{task} ")
    styler.format_index(shorten, axis="index")
    return styler.to_html()


def update_tasks_component():
    """Return the visible "Tasks" radio selector twice, one entry per output component.

    Returns:
        A 2-tuple holding the same ``gr.Radio`` instance in both slots. Its choices
        are "All" followed by the values of ``constants.TASKS``, with "All" selected.
    """
    task_choices = ["All"] + list(constants.TASKS.values())
    tasks_radio = gr.Radio(
        task_choices,
        label="Tasks",
        info="Evaluation tasks to be displayed",
        value="All",
        visible=True,
    )
    return tasks_radio, tasks_radio


def clear_results():
    """Return the reset values for every component touched by a results load.

    Returns:
        An 8-tuple, in this order: model_id_1, model_id_2, dataframe_1, dataframe_2
        (all ``None``), load_results_btn, load_configs_btn (the same disabled "Load"
        button), results_task, configs_task (the same hidden "Tasks" radio).
    """
    disabled_load_button = gr.Button("Load", interactive=False)
    hidden_tasks_radio = gr.Radio(
        ["All"] + list(constants.TASKS.values()),
        label="Tasks",
        info="Evaluation tasks to be displayed",
        value="All",
        visible=False,
    )
    cleared_values = (None,) * 4
    return (
        *cleared_values,
        disabled_load_button,
        disabled_load_button,
        hidden_tasks_radio,
        hidden_tasks_radio,
    )


def highlight_min_max(s):
    """Return per-cell CSS that highlights the best value of a score row.

    Args:
        s: One table row as a Series; ``s.name`` is the row label.

    Returns:
        For labels ending in "acc,none", "acc_norm,none" or "exact_match,none": an
        array with a green background for cells equal to the row's NaN-ignoring
        maximum and a "#D81B60" background for all others. For any other label: a
        list of empty strings, one per cell.
    """
    score_suffixes = ("acc,none", "acc_norm,none", "exact_match,none")
    if not s.name.endswith(score_suffixes):
        return [""] * len(s)
    best_value = np.nanmax(s.values)
    return np.where(s == best_value, "background-color:green", "background-color:#D81B60")


def display_loading_message_for_results():
    """Return the centered "Loading..." HTML heading twice, one entry per output component.

    Returns:
        A 2-tuple with the same HTML string in both slots.
    """
    loading_html = "<h3 style='text-align: center;'>Loading...</h3>"
    return loading_html, loading_html