File size: 7,241 Bytes
c2c9efa
 
25557b5
 
841e241
 
ca2b34f
c2c9efa
 
8e404a5
6679087
07448fb
e611814
8e404a5
 
e611814
4289e9d
e611814
023a289
c1fc7f4
7a0e5b8
 
 
e611814
 
8e404a5
6679087
e611814
8e404a5
6679087
e611814
 
05c90f4
9c39267
 
 
3caeacd
 
 
 
bf6ab81
3caeacd
ca2b34f
 
 
 
 
9c39267
 
 
 
 
 
 
 
 
 
 
ca2b34f
 
 
 
 
9c39267
7379857
3caeacd
5009abb
3caeacd
71dfe85
3caeacd
 
ca2b34f
 
 
 
7379857
3caeacd
7379857
71dfe85
7379857
c8b695a
07448fb
bd858f5
 
 
 
 
 
 
7379857
 
 
 
25557b5
1f43e72
9c39267
3caeacd
9c39267
3caeacd
9c39267
 
99aea78
 
 
5b4c5f8
3caeacd
9c39267
ddc25db
9c39267
 
 
ca2b34f
 
 
 
 
 
 
 
 
 
 
8f68cc2
9c39267
6679087
9c39267
6679087
7379857
9c39267
 
54202cb
9c39267
54202cb
7379857
3caeacd
ca2b34f
 
 
 
3caeacd
 
 
 
c8b695a
1c1cb58
c8b695a
 
 
 
7379857
99aea78
 
 
bd858f5
 
 
 
7379857
0d84f54
 
7379857
eec78c0
7379857
e611814
07448fb
 
1c1cb58
07448fb
e611814
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
from functools import partial

import gradio as gr

from src.constants import SUBTASKS, TASKS
from src.details import update_subtasks_component, update_load_details_component, load_details_dataframes, \
    display_details, update_sample_idx_component, clear_details, update_task_description_component
from src.results import update_load_results_component, \
    load_results_dataframes, display_results, update_tasks_component, clear_results, \
    sort_result_paths_per_model, fetch_result_paths


# Module-level setup, executed at import time (not under a `__main__` guard).
# Collect the available result paths once and group them per model; the keys
# of the resulting mapping populate the model dropdowns of the UI.
# NOTE(review): fetch_result_paths presumably performs remote or file I/O —
# confirm in src.results.
_result_paths = fetch_result_paths()
result_paths_per_model = sort_result_paths_per_model(_result_paths)

# Pre-bind the per-model paths so the Gradio callback only needs to receive the
# values coming from the UI components.
load_results_dataframes = partial(
    load_results_dataframes,
    result_paths_per_model=result_paths_per_model,
)

# Build the comparator UI: a header, two model selectors, and three tabs
# ("Results", "Configs", "Details"), then wire the component events to the
# handlers imported from src.results and src.details.
with gr.Blocks(fill_height=True, fill_width=True) as demo:
    # Choice lists shared by several components; computed once.
    model_choices = list(result_paths_per_model.keys())
    task_choices = ["All"] + list(TASKS.values())

    # --- Header -----------------------------------------------------------
    # Emoji are written as \N{...} escapes so the source stays pure ASCII and
    # cannot be corrupted by an encoding round-trip.
    gr.HTML("<h1 style='text-align: center;'>Compare Results of the \N{HUGGING FACE} Open LLM Leaderboard</h1>")
    gr.HTML("<h3 style='text-align: center;'>Select 2 models to load and compare their results</h3>")
    gr.HTML(
        "<p style='text-align: center; color:orange;'>&#9888; This demo is a beta version and may contain bugs, "
        "performance issues, incomplete features, or unexpected behavior. We appreciate your understanding and "
        "welcome any feedback through the Community tab to help improve the final product.</p>"
    )
    gr.Markdown(
        "Compare Results of the \N{HUGGING FACE} "
        "[Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard-old/open_llm_leaderboard). "
        "Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) "
        "\N{PAGE FACING UP} to find explanations on the evaluations used, their configuration parameters "
        "and details on the input/outputs for the models."
    )

    # --- Model selection --------------------------------------------------
    # Each column pairs a model dropdown with a hidden Dataframe that holds
    # the loaded results for that model.
    with gr.Row():
        with gr.Column():
            model_id_1 = gr.Dropdown(choices=model_choices, label="Models")
            dataframe_1 = gr.Dataframe(visible=False)
        with gr.Column():
            model_id_2 = gr.Dropdown(choices=model_choices, label="Models")
            dataframe_2 = gr.Dataframe(visible=False)

    # --- Tabs -------------------------------------------------------------
    with gr.Row():
        with gr.Tab("Results"):
            # Starts disabled; re-rendered by update_load_results_component
            # when a model dropdown receives user input.
            load_results_btn = gr.Button("Load", interactive=False)
            clear_results_btn = gr.Button("Clear")
            results_task = gr.Radio(
                task_choices,
                label="Tasks",
                info="Evaluation tasks to be displayed",
                value="All",
                visible=False,
            )
            results_task_description = gr.Textbox(
                label="Task Description",
                lines=3,
                visible=False,
            )
            results = gr.HTML()
        with gr.Tab("Configs"):
            # Mirrors the "Results" tab; both tabs share the same load/clear
            # handlers and their task radios are kept in sync below.
            load_configs_btn = gr.Button("Load", interactive=False)
            clear_configs_btn = gr.Button("Clear")
            configs_task = gr.Radio(
                task_choices,
                label="Tasks",
                info="Evaluation tasks to be displayed",
                value="All",
                visible=False,
            )
            configs_task_description = gr.Textbox(
                label="Task Description",
                lines=3,
                visible=False,
            )
            configs = gr.HTML()
        with gr.Tab("Details"):
            details_task = gr.Radio(
                # Every task except the one whose second element is
                # "leaderboard_gpqa".
                # NOTE(review): TASKS values appear to be (label, task_id)
                # pairs — confirm in src.constants.
                [task for task in TASKS.values() if task[1] != "leaderboard_gpqa"],
                label="Tasks",
                info="Evaluation tasks to be loaded",
                interactive=True,
            )
            details_task_description = gr.Textbox(
                label="Task Description",
                lines=3,
            )
            subtask = gr.Radio(
                # No task is selected at build time, so this looks up the
                # radio's initial value; the choices are refreshed by
                # update_subtasks_component when the task changes.
                SUBTASKS.get(details_task.value),
                label="Subtasks",
                info="Evaluation subtasks to be loaded (choose one of the Tasks above)",
            )
            load_details_btn = gr.Button("Load Details", interactive=False)
            clear_details_btn = gr.Button("Clear Details")
            sample_idx = gr.Number(
                label="Sample Index",
                info="Index of the sample to be displayed",
                value=0,
                minimum=0,
                visible=False,
            )
            details = gr.HTML()
            details_dataframe_1 = gr.Dataframe(visible=False)
            details_dataframe_2 = gr.Dataframe(visible=False)
            # Not referenced by any event in this block; kept so the set of
            # rendered components is unchanged.
            details_dataframe = gr.Dataframe(visible=False)

    # --- Results / Configs wiring ------------------------------------------
    # User input on either model dropdown refreshes both "Load" buttons.
    gr.on(
        triggers=[model_id_1.input, model_id_2.input],
        fn=update_load_results_component,
        outputs=[load_results_btn, load_configs_btn],
    )
    # Either "Load" button loads both models' results into the hidden
    # dataframes, then refreshes both task radios.
    gr.on(
        triggers=[load_results_btn.click, load_configs_btn.click],
        fn=load_results_dataframes,
        inputs=[model_id_1, model_id_2],
        outputs=[dataframe_1, dataframe_2],
    ).then(
        fn=update_tasks_component,
        outputs=[results_task, configs_task],
    )
    # Synchronize the results_task and configs_task radio buttons. `.input`
    # (user interaction only) is used so the mirrored update does not bounce
    # back and forth between the two radios.
    results_task.input(fn=lambda task: task, inputs=results_task, outputs=configs_task)
    configs_task.input(fn=lambda task: task, inputs=configs_task, outputs=results_task)
    # Update task descriptions on both tabs from the selected results task.
    results_task.change(
        fn=update_task_description_component,
        inputs=results_task,
        outputs=results_task_description,
    ).then(
        fn=update_task_description_component,
        inputs=results_task,
        outputs=configs_task_description,
    )
    # Display results: re-render both HTML panes whenever the loaded data or
    # the selected task changes.
    gr.on(
        triggers=[dataframe_1.change, dataframe_2.change, results_task.change],
        fn=display_results,
        inputs=[results_task, dataframe_1, dataframe_2],
        outputs=[results, configs],
    )
    # Either "Clear" button resets the selectors, loaded data, load buttons
    # and task radios.
    gr.on(
        triggers=[clear_results_btn.click, clear_configs_btn.click],
        fn=clear_results,
        outputs=[
            model_id_1,
            model_id_2,
            dataframe_1,
            dataframe_2,
            load_results_btn,
            load_configs_btn,
            results_task,
            configs_task,
        ],
    )

    # --- Details wiring -----------------------------------------------------
    # Changing the task updates its description, then the subtask radio.
    details_task.change(
        fn=update_task_description_component,
        inputs=details_task,
        outputs=details_task_description,
    ).then(
        fn=update_subtasks_component,
        inputs=details_task,
        outputs=subtask,
    )
    # Any user input affecting the selection refreshes the "Load Details"
    # button from the current models and subtask.
    gr.on(
        triggers=[model_id_1.input, model_id_2.input, subtask.input, details_task.input],
        fn=update_load_details_component,
        inputs=[model_id_1, model_id_2, subtask],
        outputs=load_details_btn,
    )
    # Load the per-sample details for both models, then refresh the sample
    # index control from the loaded dataframes.
    load_details_btn.click(
        fn=load_details_dataframes,
        inputs=[subtask, model_id_1, model_id_2],
        outputs=[details_dataframe_1, details_dataframe_2],
    ).then(
        fn=update_sample_idx_component,
        inputs=[details_dataframe_1, details_dataframe_2],
        outputs=sample_idx,
    )
    # Re-render the details pane whenever the data or the sample index changes.
    gr.on(
        triggers=[details_dataframe_1.change, details_dataframe_2.change, sample_idx.change],
        fn=display_details,
        inputs=[sample_idx, details_dataframe_1, details_dataframe_2],
        outputs=details,
    )
    clear_details_btn.click(
        fn=clear_details,
        outputs=[
            model_id_1,
            model_id_2,
            details_dataframe_1,
            details_dataframe_2,
            details_task,
            subtask,
            load_details_btn,
            sample_idx,
        ],
    )

# Start the Gradio app. The call sits at module top level without a
# `__main__` guard, so it runs whenever this module is executed or imported.
demo.launch()