Spaces:

open-llm-leaderboard
/

comparator

Running

File size: 8,419 Bytes

c2c9efa
 
25557b5
 
30a0c61
611a3ed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07448fb
e611814
8e404a5
 
e611814
4289e9d
e611814
023a289
19a6010
 
 
 
 
 
7a0e5b8
e611814
 
8e404a5
6679087
e611814
8e404a5
6679087
e611814
 
05c90f4
9c39267
 
 
30a0c61
3caeacd
 
 
bf6ab81
3caeacd
ca2b34f
 
 
 
 
585c3fa
9c39267
 
 
 
 
30a0c61
9c39267
 
 
 
 
ca2b34f
 
 
 
 
f12aa56
9c39267
7379857
3caeacd
26ef426
3caeacd
71dfe85
3caeacd
 
ca2b34f
 
 
 
26ef426
 
 
 
 
 
 
c8b695a
07448fb
bd858f5
611a3ed
bd858f5
6cf57e4
7379857
 
 
 
25557b5
1f43e72
9c39267
3caeacd
9c39267
3caeacd
9c39267
 
8f7c83f
 
 
99aea78
 
 
5b4c5f8
3caeacd
9c39267
ddc25db
9c39267
 
 
ca2b34f
 
 
 
 
 
 
 
 
 
 
8f68cc2
f12aa56
 
 
 
585c3fa
f12aa56
 
6679087
585c3fa
6679087
7379857
9c39267
 
54202cb
611a3ed
 
 
 
 
 
 
 
 
 
54202cb
7379857
8f7c83f
3caeacd
ca2b34f
 
 
 
3caeacd
 
26ef426
3caeacd
c8b695a
1c1cb58
c8b695a
 
 
 
7379857
8f7c83f
 
 
99aea78
 
 
bd858f5
 
 
 
7379857
0d84f54
6cf57e4
 
 
 
 
 
7379857
6cf57e4
7379857
e611814
07448fb
 
611a3ed
 
 
 
 
 
 
 
 
 
07448fb
e611814

from functools import partial

import gradio as gr

import src.constants as constants
from src.details import (
    clear_details,
    display_details,
    display_loading_message_for_details,
    load_details_dataframes,
    update_load_details_component,
    update_sample_idx_component,
    update_subtasks_component,
    update_task_description_component,
)
from src.results import (
    clear_results,
    display_loading_message_for_results,
    display_results,
    fetch_result_paths,
    load_results_dataframes,
    sort_result_paths_per_model,
    update_load_results_component,
    update_tasks_component,
)


# NOTE: the `if __name__ == "__main__":` guard is intentionally not applied
# here, so everything below (including the fetch on the next line) runs at
# import time as well as when the file is executed as a script.
#
# Fetch the available result paths and arrange them per model. The result is
# used below as a mapping whose keys are the choices of both model dropdowns.
result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
# Rebind the imported loader with `result_paths_per_model` pre-filled, so the
# event chain only has to supply the two selected model ids as inputs.
load_results_dataframes = partial(load_results_dataframes, result_paths_per_model=result_paths_per_model)

# Build the comparator UI as a Gradio Blocks app bound to `demo`.
# The layout (header, model pickers, three tabs) is declared first; the event
# wiring for the Results/Configs tabs and then the Details tab follows.
with gr.Blocks(fill_height=True, fill_width=True) as demo:
    # --- Page header ---
    gr.HTML("<h1 style='text-align: center;'>Compare Results of the 🤗 Open LLM Leaderboard</h1>")
    gr.HTML("<h3 style='text-align: center;'>Select 2 models to load and compare their results</h3>")
    gr.HTML(
        "<p style='text-align: center; color:orange;'>⚠ This demo is a beta version, and we're actively working on it, so you might find some tiny bugs! Please report any issues you have in the Community tab to help us make it better for all.</p>"
    )
    gr.Markdown(
        "Compare Results of the 🤗 [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). "
        "Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) 📄 to find explanations on the evaluations used, their configuration parameters and details on the input/outputs for the models."
    )
    # --- Model pickers ---
    # One dropdown per model to compare; both offer the same choices (the keys
    # of result_paths_per_model). Each column also holds a hidden Dataframe
    # that receives that model's loaded results and is later read by
    # display_results, i.e. it is used as a data holder rather than shown.
    with gr.Row():
        with gr.Column():
            model_id_1 = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models")
            dataframe_1 = gr.Dataframe(visible=False)
        with gr.Column():
            model_id_2 = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models")
            dataframe_2 = gr.Dataframe(visible=False)

    with gr.Row():
        # --- Results tab ---
        # The Load button starts disabled and the task selector/description
        # start hidden; they are outputs of the handlers wired further down.
        with gr.Tab("Results"):
            load_results_btn = gr.Button("Load", interactive=False)
            clear_results_btn = gr.Button("Clear")
            results_task = gr.Radio(
                ["All"] + list(constants.TASKS.values()),
                label="Tasks",
                info="Evaluation tasks to be displayed",
                value="All",
                visible=False,
            )
            results_task_description = gr.Textbox(
                label="Task Description",
                lines=3,
                visible=False,
            )
            hide_std_errors = gr.Checkbox(label="Hide Standard Errors", value=True, info="Options")
            results = gr.HTML()
        # --- Configs tab ---
        # Same structure as the Results tab. It has its own buttons and task
        # selector, but is driven by the same hidden dataframes and the same
        # display_results call (which outputs to both `results` and `configs`).
        with gr.Tab("Configs"):
            load_configs_btn = gr.Button("Load", interactive=False)
            clear_configs_btn = gr.Button("Clear")
            configs_task = gr.Radio(
                ["All"] + list(constants.TASKS.values()),
                label="Tasks",
                info="Evaluation tasks to be displayed",
                value="All",
                visible=False,
            )
            configs_task_description = gr.Textbox(
                label="Task Description",
                lines=3,
                visible=False,
            )
            show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
            configs = gr.HTML()
        # --- Details tab ---
        # Unlike the other two tabs, the task selector has no "All" option and
        # no initial value; a subtask must additionally be chosen before loading.
        with gr.Tab("Details"):
            details_task = gr.Radio(
                list(constants.TASKS.values()),
                label="Tasks",
                info="Evaluation tasks to be loaded",
                interactive=True,
            )
            details_task_description = gr.Textbox(
                label="Task Description",
                lines=3,
            )
            with gr.Row():
                # Hidden initially; it is an output of update_subtasks_component below.
                login_btn = gr.LoginButton(size="sm", visible=False)
                subtask = gr.Radio(
                    choices=None,  # no choices initially; `subtask` is an output of update_subtasks_component below
                    label="Subtasks",
                    info="Evaluation subtasks to be loaded (choose one of the Tasks above)",
                )
            load_details_btn = gr.Button("Load Details", interactive=False)
            clear_details_btn = gr.Button("Clear Details")
            sample_idx = gr.Number(
                label="Sample Index", info="Index of the sample to be displayed", value=0, minimum=0, visible=False
            )
            details_show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
            details = gr.HTML()
            # Hidden holders for the two models' loaded details (see the
            # load_details_btn chain below).
            details_dataframe_1 = gr.Dataframe(visible=False)
            details_dataframe_2 = gr.Dataframe(visible=False)
            # NOTE(review): `details_dataframe` is not referenced by any event
            # wiring in this block, and it is spelled `gr.DataFrame` while every
            # other dataframe here uses `gr.Dataframe` — confirm whether it is
            # still needed.
            details_dataframe = gr.DataFrame(visible=False)

    # RESULTS / CONFIGS:
    # When the user edits either model dropdown, refresh both Load buttons.
    # No inputs are passed, so the handler does not see the selected values.
    gr.on(
        triggers=[model_id_1.input, model_id_2.input],
        fn=update_load_results_component,
        outputs=[load_results_btn, load_configs_btn],
    )
    # Clicking either Load button runs a three-step chain:
    #   1. write a loading message into both HTML panes,
    #   2. load the two selected models' results into the hidden dataframes,
    #   3. update both task selectors.
    gr.on(
        triggers=[load_results_btn.click, load_configs_btn.click],
        fn=display_loading_message_for_results,
        outputs=[results, configs],
    ).then(
        fn=load_results_dataframes,
        inputs=[model_id_1, model_id_2],
        outputs=[dataframe_1, dataframe_2],
    ).then(
        fn=update_tasks_component,
        outputs=[results_task, configs_task],
    )
    # Synchronize the results_task and configs_task radio buttons: each one
    # copies its value into the other.
    # NOTE(review): this relies on `.input` firing only for user edits (not for
    # programmatic updates) so the two handlers do not re-trigger each other —
    # confirm against the Gradio version in use.
    results_task.input(fn=lambda task: task, inputs=results_task, outputs=configs_task)
    configs_task.input(fn=lambda task: task, inputs=configs_task, outputs=results_task)
    # Update task descriptions. Both description boxes are driven from
    # results_task only; a change made on the Configs tab reaches this handler
    # via the synchronization above (presumably through results_task.change).
    results_task.change(
        fn=update_task_description_component,
        inputs=results_task,
        outputs=results_task_description,
    ).then(
        fn=update_task_description_component,
        inputs=results_task,
        outputs=configs_task_description,
    )
    # Display results: re-render both HTML panes whenever the loaded data, the
    # selected task, or either display option changes.
    gr.on(
        triggers=[
            dataframe_1.change,
            dataframe_2.change,
            results_task.change,
            hide_std_errors.change,
            show_only_differences.change,
        ],
        fn=display_results,
        inputs=[results_task, hide_std_errors, show_only_differences, dataframe_1, dataframe_2],
        outputs=[results, configs],
    )
    # Either Clear button resets the model pickers, the hidden dataframes, both
    # Load buttons and both task selectors. The HTML panes are not outputs
    # here; they are only rewritten by display_results, which is triggered by
    # the dataframe/task change events above.
    gr.on(
        triggers=[clear_results_btn.click, clear_configs_btn.click],
        fn=clear_results,
        outputs=[
            model_id_1,
            model_id_2,
            dataframe_1,
            dataframe_2,
            load_results_btn,
            load_configs_btn,
            results_task,
            configs_task,
        ],
    )

    # DETAILS:
    # Changing the task updates its description, then updates the login button
    # and the subtask selector for that task.
    details_task.change(
        fn=update_task_description_component,
        inputs=details_task,
        outputs=details_task_description,
    ).then(
        fn=update_subtasks_component,
        inputs=details_task,
        outputs=[login_btn, subtask],
    )
    # Refresh the "Load Details" button when the user edits either model, the
    # subtask, or the task. Only the two models and the subtask are passed as
    # inputs; details_task is a trigger but not an input.
    gr.on(
        triggers=[model_id_1.input, model_id_2.input, subtask.input, details_task.input],
        fn=update_load_details_component,
        inputs=[model_id_1, model_id_2, subtask],
        outputs=load_details_btn,
    )
    # Clicking "Load Details" runs a three-step chain:
    #   1. write a loading message into the details pane,
    #   2. load the selected subtask's details for both models into the hidden
    #      details dataframes,
    #   3. update the sample index control from those dataframes.
    load_details_btn.click(
        fn=display_loading_message_for_details,
        outputs=details,
    ).then(
        fn=load_details_dataframes,
        inputs=[subtask, model_id_1, model_id_2],
        outputs=[details_dataframe_1, details_dataframe_2],
    ).then(
        fn=update_sample_idx_component,
        inputs=[details_dataframe_1, details_dataframe_2],
        outputs=sample_idx,
    )
    # Re-render the details pane whenever the loaded details, the sample index,
    # or the "Show Only Differences" option changes.
    gr.on(
        triggers=[
            details_dataframe_1.change,
            details_dataframe_2.change,
            sample_idx.change,
            details_show_only_differences.change,
        ],
        fn=display_details,
        inputs=[sample_idx, details_show_only_differences, details_dataframe_1, details_dataframe_2],
        outputs=details,
    )
    # "Clear Details" resets the shared model pickers as well as the
    # Details-tab state (hidden dataframes, task, subtask, load button, index).
    clear_details_btn.click(
        fn=clear_details,
        outputs=[
            model_id_1,
            model_id_2,
            details_dataframe_1,
            details_dataframe_2,
            details_task,
            subtask,
            load_details_btn,
            sample_idx,
        ],
    )

# Start the Gradio app. This is called unconditionally at module level, so
# importing this module also launches it.
# NOTE(review): consider whether this call should sit behind an
# `if __name__ == "__main__":` guard.
demo.launch()