File size: 11,485 Bytes
25557b5
8a91492
25557b5
30a0c61
611a3ed
 
 
 
bea7063
611a3ed
 
 
 
 
608184c
8a91492
523fad9
611a3ed
 
b1b50fb
611a3ed
 
b1b50fb
729af67
bea7063
07db628
611a3ed
 
 
07448fb
e611814
 
bea7063
e611814
bea7063
19a6010
 
 
 
 
 
7a0e5b8
e611814
729af67
 
e611814
523fad9
 
 
 
 
 
 
 
 
 
 
 
e611814
05c90f4
9c39267
 
 
30a0c61
3caeacd
 
 
bf6ab81
3caeacd
ca2b34f
 
 
 
 
585c3fa
a56da8a
 
 
9c39267
bea7063
b1b50fb
 
9c39267
 
 
 
30a0c61
9c39267
 
 
 
 
ca2b34f
 
 
 
 
f12aa56
9c39267
7379857
3caeacd
26ef426
3caeacd
71dfe85
3caeacd
 
ca2b34f
 
 
 
26ef426
 
 
 
 
 
 
c8b695a
07448fb
bd858f5
611a3ed
bd858f5
6cf57e4
7379857
bea7063
608184c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25557b5
729af67
 
 
 
 
 
 
 
 
 
523fad9
1f43e72
bea7063
608184c
 
3caeacd
523fad9
 
9c39267
608184c
8f7c83f
608184c
8f7c83f
bea7063
523fad9
 
 
 
 
966ae7b
5b4c5f8
3caeacd
9c39267
ddc25db
9c39267
 
 
ca2b34f
 
 
 
 
 
 
 
 
 
 
8f68cc2
f12aa56
bea7063
f12aa56
585c3fa
f12aa56
 
6679087
bea7063
608184c
07db628
 
bea7063
a56da8a
608184c
 
 
 
b1b50fb
 
 
 
 
 
 
 
7379857
9c39267
608184c
54202cb
611a3ed
bea7063
 
611a3ed
 
608184c
611a3ed
 
 
523fad9
 
 
 
 
 
b1b50fb
 
 
54202cb
7379857
8f7c83f
3caeacd
ca2b34f
 
 
 
3caeacd
 
26ef426
3caeacd
c8b695a
bea7063
c8b695a
bea7063
c8b695a
 
7379857
8f7c83f
 
 
bea7063
523fad9
 
 
 
 
966ae7b
bd858f5
 
bea7063
bd858f5
7379857
0d84f54
6cf57e4
bea7063
6cf57e4
 
 
7379857
bea7063
7379857
e611814
07448fb
 
611a3ed
bea7063
 
611a3ed
 
 
 
 
07448fb
e611814
523fad9
 
 
 
 
 
 
 
 
8a91492
 
 
 
 
e611814
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler

import src.constants as constants
from src.details import (
    clear_details,
    display_details,
    display_loading_message_for_details,
    load_details,
    update_load_details_component,
    update_sample_idx_component,
    update_subtasks_component,
    update_task_description_component,
)
from src.env_impact import plot_env_impact
from src.hub import restart_space
from src.model_tree import load_model_tree
from src.results import (
    clear_results,
    clear_results_file,
    display_loading_message_for_results,
    display_results,
    download_results,
    load_result_paths_per_model,
    load_results,
    plot_results,
    update_tasks_component,
)


# if __name__ == "__main__":

# Build the whole UI inside a single Blocks context; the resulting `demo` app is launched at the end of the file.
with gr.Blocks(fill_height=True, fill_width=True) as demo:
    # Page header: title, subtitle, beta-version warning and links to the leaderboard and its documentation.
    # The emoji are written as real Unicode characters; they were previously stored as mis-decoded UTF-8
    # (mojibake) and rendered as garbage in the page.
    gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
    gr.HTML("<h3 style='text-align: center;'>Select models to load and compare their results</h3>")
    gr.HTML(
        "<p style='text-align: center; color:orange;'>⚠ This demo is a beta version, and we're actively working on it, so you might find some tiny bugs! Please report any issues you have in the Community tab to help us make it better for all.</p>"
    )
    gr.Markdown(
        "Compare Results of the 🤗 [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). "
        "Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) 📄 to find explanations on the evaluations used, their configuration parameters and details on the input/outputs for the models."
    )
    # Model selection: a multi-select dropdown plus a State holding the result paths per model.
    with gr.Row():
        model_ids = gr.Dropdown(multiselect=True, label="Models")
        result_paths_per_model = gr.State()

    with gr.Accordion("Model tree: Compare base and derived models", open=False):
        load_model_tree_btn = gr.Button("Load Model Tree", interactive=False)
        # One label per model type: the base type first, then every derived type.
        model_tree_labels = [
            model_type[0] for model_type in (constants.BASE_MODEL_TYPE, *constants.DERIVED_MODEL_TYPES)
        ]
        base_model_label, *derived_model_labels = model_tree_labels
        # The base-model dropdown is placed directly in the accordion ...
        base_and_derived_models = [gr.Dropdown(label=base_model_label, multiselect=True)]
        # ... while the derived-model dropdowns share one row and start out non-interactive.
        with gr.Row():
            base_and_derived_models += [
                gr.Dropdown(label=derived_label, multiselect=True, interactive=False)
                for derived_label in derived_model_labels
            ]

    # Tabbed area: this row holds the "Results" tab and the tabs declared after it.
    with gr.Row():
        with gr.Tab("Results"):
            load_results_btn = gr.Button("Load", interactive=False)
            clear_results_btn = gr.Button("Clear")
            # Task filter ("All" plus every value of constants.TASKS); created hidden.
            results_task = gr.Radio(
                choices=["All", *constants.TASKS.values()],
                value="All",
                label="Tasks",
                info="Evaluation tasks to be displayed",
                visible=False,
            )
            results_task_description = gr.Textbox(label="Task Description", lines=3, visible=False)
            hide_std_errors = gr.Checkbox(value=True, label="Hide Standard Errors", info="Options")
            # Two plots side by side.
            with gr.Row():
                results_plot_1, results_plot_2 = gr.Plot(visible=True), gr.Plot(visible=True)
            # Rendered results and the State backing them.
            results = gr.HTML()
            results_dataframe = gr.State()
            # Download button and the (initially hidden) file component it fills.
            download_results_btn = gr.Button("Download")
            results_file = gr.File(visible=False)
        # "Configs" tab: same load/clear/task controls as the "Results" tab, plus a differences-only option.
        with gr.Tab("Configs"):
            load_configs_btn = gr.Button("Load", interactive=False)
            clear_configs_btn = gr.Button("Clear")
            # Task filter ("All" plus every value of constants.TASKS); created hidden.
            configs_task = gr.Radio(
                choices=["All", *constants.TASKS.values()],
                value="All",
                label="Tasks",
                info="Evaluation tasks to be displayed",
                visible=False,
            )
            configs_task_description = gr.Textbox(label="Task Description", lines=3, visible=False)
            show_only_differences = gr.Checkbox(value=False, label="Show Only Differences", info="Options")
            # Rendered configuration view.
            configs = gr.HTML()
        # "Details" tab: pick a task and subtask, load the details and browse them sample by sample.
        with gr.Tab("Details"):
            # Task selector; unlike the Results/Configs radios it has no "All" entry and is visible from the start.
            details_task = gr.Radio(
                list(constants.TASKS.values()),
                label="Tasks",
                info="Evaluation tasks to be loaded",
                interactive=True,
            )
            details_task_description = gr.Textbox(
                label="Task Description",
                lines=3,
            )
            with gr.Row():
                # Login button is created hidden; it is an output of the details_task.change chain further down.
                login_btn = gr.LoginButton(size="sm", visible=False)
                subtask = gr.Radio(
                    # No choices at construction time (formerly: constants.SUBTASKS.get(details_task.value));
                    # the component is updated by update_subtasks_component when the task changes.
                    choices=None,
                    label="Subtasks",
                    info="Evaluation subtasks to be loaded (choose one of the Tasks above)",
                )
            # Disabled until update_load_details_component enables it (see the DETAILS wiring).
            load_details_btn = gr.Button("Load Details", interactive=False)
            clear_details_btn = gr.Button("Clear Details")
            # Hidden at first; updated by update_sample_idx_component after details are loaded.
            sample_idx = gr.Number(
                label="Sample Index", info="Index of the sample to be displayed", value=0, minimum=0, visible=False
            )
            details_show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
            # Rendered details and the State backing them.
            details = gr.HTML()
            details_dataframe = gr.State()
        # "Environmental impact" tab: explanatory text, load/clear buttons, two plots and an HTML view.
        with gr.Tab("Environmental impact"):
            # The emoji before the Accelerate link is a real Unicode character; it was previously stored as
            # mis-decoded UTF-8 (mojibake) and rendered as garbage.
            gr.Markdown(
                "The environmental impact calculations we display are derived from the specific inference setup used "
                "for evaluation. We leverage 🤗 [Accelerate](https://huggingface.co/docs/accelerate) to efficiently "
                "parallelize the model across 8 Nvidia H100 SXM GPUs in a compute cluster located in Northern Virginia. "
                "These results reflect the energy consumption and associated emissions of this configuration, "
                "providing transparency and insight into the resource requirements of large language model evaluations. "
                "You can find more details in our documentation about the [environmental impact](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/emissions)."
            )
            load_env_impact_btn = gr.Button("Load", interactive=False)
            clear_env_impact_btn = gr.Button("Clear")
            # Two plots side by side.
            with gr.Row():
                env_impact_plot_1 = gr.Plot(visible=True)
                env_impact_plot_2 = gr.Plot(visible=True)
            # Rendered environmental-impact view.
            env_impact = gr.HTML()

    # DEMO:
    # On page load, store the result paths per model, then offer the keys of that mapping as model choices.
    paths_loaded = demo.load(
        fn=load_result_paths_per_model,
        outputs=result_paths_per_model,
    )
    paths_loaded.then(
        fn=lambda paths_per_model: gr.Dropdown(choices=list(paths_per_model.keys())),
        inputs=result_paths_per_model,
        outputs=model_ids,
    )

    # Buttons:
    # A user edit of the model selection makes every "load" button interactive.
    load_buttons = [load_model_tree_btn, load_results_btn, load_configs_btn, load_env_impact_btn]
    gr.on(
        triggers=[model_ids.input],
        fn=lambda: (gr.Button(interactive=True),) * len(load_buttons),
        outputs=load_buttons,
    )

    # RESULTS:
    # Any of the three "Load" buttons: show the loading message, load the results, then update both task radios.
    loading_shown = gr.on(
        triggers=[button.click for button in (load_results_btn, load_configs_btn, load_env_impact_btn)],
        fn=display_loading_message_for_results,
        outputs=[results, configs, env_impact],
    )
    results_loaded = loading_shown.then(
        fn=load_results,
        inputs=[result_paths_per_model, model_ids] + base_and_derived_models,
        outputs=[results_dataframe, results],
    )
    results_loaded.then(fn=update_tasks_component, outputs=[results_task, configs_task])

    # Keep the two task radios in sync: a user edit on one is mirrored onto the other.
    for source_radio, mirrored_radio in ((results_task, configs_task), (configs_task, results_task)):
        source_radio.input(fn=lambda task: task, inputs=source_radio, outputs=mirrored_radio)

    # Refresh both task descriptions from results_task, first on the Results tab, then on the Configs tab.
    results_description_updated = results_task.change(
        fn=update_task_description_component,
        inputs=results_task,
        outputs=results_task_description,
    )
    results_description_updated.then(
        fn=update_task_description_component,
        inputs=results_task,
        outputs=configs_task_description,
    )
    # Display results
    # Re-render whenever the loaded data, the selected task or one of the display options changes.
    # The chained steps run in order: render the three HTML views, draw the results plots,
    # draw the environmental-impact plots, and finally run clear_results_file on the download file component.
    gr.on(
        triggers=[
            results_dataframe.change,
            results_task.change,
            hide_std_errors.change,
            show_only_differences.change,
        ],
        fn=display_results,
        inputs=[results_dataframe, results_task, hide_std_errors, show_only_differences],
        outputs=[results, configs, env_impact],
    ).then(
        fn=plot_results,
        inputs=[results_dataframe, results_task],
        outputs=[results_plot_1, results_plot_2],
    ).then(
        fn=plot_env_impact,
        inputs=[results_dataframe],
        outputs=[env_impact_plot_1, env_impact_plot_2],
    ).then(
        fn=clear_results_file,
        outputs=results_file,
    )
    # Download: the handler's return value is written to the results_file component.
    # NOTE(review): the input is the rendered `results` HTML component, not `results_dataframe` —
    # confirm this is what download_results expects.
    download_results_btn.click(
        fn=download_results,
        inputs=results,
        outputs=results_file,
    )
    # Clear: any of the three "Clear" buttons runs clear_results on the model selection, the results state,
    # the three load buttons and both task radios, then resets the model-tree widgets and the download file.
    gr.on(
        triggers=[clear_results_btn.click, clear_configs_btn.click, clear_env_impact_btn.click],
        fn=clear_results,
        outputs=[
            model_ids,
            results_dataframe,
            load_results_btn,
            load_configs_btn,
            load_env_impact_btn,
            results_task,
            configs_task,
        ],
    ).then(
        # Disable the "Load Model Tree" button again.
        fn=lambda: gr.Button(interactive=False),
        outputs=load_model_tree_btn,
    ).then(
        # Replace every model-tree dropdown with a fresh, non-interactive one.
        # NOTE(review): this also makes the base-model dropdown non-interactive, although it is initially
        # created without interactive=False — confirm this is intended.
        fn=lambda: [gr.Dropdown(label=label, multiselect=True, interactive=False) for label in model_tree_labels],
        outputs=[*base_and_derived_models],
    ).then(
        fn=clear_results_file,
        outputs=results_file,
    )

    # DETAILS:
    # When the task changes, refresh its description, then update the login button and the subtask radio.
    details_description_updated = details_task.change(
        fn=update_task_description_component,
        inputs=details_task,
        outputs=details_task_description,
    )
    details_description_updated.then(
        fn=update_subtasks_component,
        inputs=details_task,
        outputs=[login_btn, subtask],
    )
    # A user edit of the models, the subtask or the task re-evaluates the "Load Details" button
    # from the current model selection and subtask.
    gr.on(
        triggers=[component.input for component in (model_ids, subtask, details_task)],
        fn=update_load_details_component,
        inputs=[model_ids, subtask],
        outputs=load_details_btn,
    )
    # "Load Details": show a loading message, load the details for the chosen subtask and models
    # (including the model-tree selections), then update the sample index component.
    load_details_btn.click(
        fn=display_loading_message_for_details,
        outputs=details,
    ).then(
        fn=load_details,
        inputs=[
            subtask,
            model_ids,
            *base_and_derived_models,
        ],
        outputs=[details_dataframe, details],
    ).then(
        fn=update_sample_idx_component,
        inputs=[details_dataframe],
        outputs=sample_idx,
    )
    # Re-render the details view whenever the loaded data, the sample index or the differences option changes.
    gr.on(
        triggers=[
            details_dataframe.change,
            sample_idx.change,
            details_show_only_differences.change,
        ],
        fn=display_details,
        inputs=[details_dataframe, sample_idx, details_show_only_differences],
        outputs=details,
    )
    # "Clear Details": clear_details provides new values for the model selection, the details state,
    # the task/subtask radios, the load button and the sample index.
    clear_details_btn.click(
        fn=clear_details,
        outputs=[
            model_ids,
            details_dataframe,
            details_task,
            subtask,
            load_details_btn,
            sample_idx,
        ],
    )

    # MODEL TREE:
    # "Load Model Tree" updates the base-model and derived-model dropdowns from the selected models.
    load_model_tree_btn.click(
        fn=load_model_tree,
        inputs=[result_paths_per_model, model_ids],
        outputs=list(base_and_derived_models),
    )

# Background job: call `restart_space` on a one-hour interval.
scheduler = BackgroundScheduler()
scheduler.add_job(func=restart_space, trigger="interval", hours=1)
scheduler.start()

# Serve the app built above.
demo.launch()