File size: 8,419 Bytes
c2c9efa 25557b5 30a0c61 611a3ed 07448fb e611814 8e404a5 e611814 4289e9d e611814 023a289 19a6010 7a0e5b8 e611814 8e404a5 6679087 e611814 8e404a5 6679087 e611814 05c90f4 9c39267 30a0c61 3caeacd bf6ab81 3caeacd ca2b34f 585c3fa 9c39267 30a0c61 9c39267 ca2b34f f12aa56 9c39267 7379857 3caeacd 26ef426 3caeacd 71dfe85 3caeacd ca2b34f 26ef426 c8b695a 07448fb bd858f5 611a3ed bd858f5 6cf57e4 7379857 25557b5 1f43e72 9c39267 3caeacd 9c39267 3caeacd 9c39267 8f7c83f 99aea78 5b4c5f8 3caeacd 9c39267 ddc25db 9c39267 ca2b34f 8f68cc2 f12aa56 585c3fa f12aa56 6679087 585c3fa 6679087 7379857 9c39267 54202cb 611a3ed 54202cb 7379857 8f7c83f 3caeacd ca2b34f 3caeacd 26ef426 3caeacd c8b695a 1c1cb58 c8b695a 7379857 8f7c83f 99aea78 bd858f5 7379857 0d84f54 6cf57e4 7379857 6cf57e4 7379857 e611814 07448fb 611a3ed 07448fb e611814 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 |
from functools import partial
import gradio as gr
import src.constants as constants
from src.details import (
clear_details,
display_details,
display_loading_message_for_details,
load_details_dataframes,
update_load_details_component,
update_sample_idx_component,
update_subtasks_component,
update_task_description_component,
)
from src.results import (
clear_results,
display_loading_message_for_results,
display_results,
fetch_result_paths,
load_results_dataframes,
sort_result_paths_per_model,
update_load_results_component,
update_tasks_component,
)
# if __name__ == "__main__":
# NOTE(review): the copy of this file under review had lost all leading
# whitespace. The nesting below is reconstructed from the `with` statements
# and from where each component is used; confirm it against the upstream file
# (in particular which components belong to the inner `gr.Row()` of the
# "Details" tab).

# Fetch the available result paths once, at import time. The keys of the
# resulting mapping are used as the choices of both model dropdowns below
# (presumably one key per model id -- see src.results).
result_paths_per_model = sort_result_paths_per_model(fetch_result_paths())
# Pre-bind the path mapping so that the event chain below only has to pass the
# two selected model ids. Note that this rebinds (shadows) the imported
# `load_results_dataframes` name for the rest of the module.
load_results_dataframes = partial(load_results_dataframes, result_paths_per_model=result_paths_per_model)

# The whole UI (layout and event wiring) is declared inside this Blocks
# context; only `demo.launch()` at the bottom runs outside of it.
with gr.Blocks(fill_height=True, fill_width=True) as demo:
    # --- Static header -------------------------------------------------
    # NOTE(review): the "π€", "β" and "π" glyphs in the strings below look
    # like UTF-8 emoji that were decoded with the wrong codec somewhere
    # upstream of this review copy -- confirm against the original file.
    gr.HTML("<h1 style='text-align: center;'>Compare Results of the π€ Open LLM Leaderboard</h1>")
    gr.HTML("<h3 style='text-align: center;'>Select 2 models to load and compare their results</h3>")
    gr.HTML(
        "<p style='text-align: center; color:orange;'>β This demo is a beta version, and we're actively working on it, so you might find some tiny bugs! Please report any issues you have in the Community tab to help us make it better for all.</p>"
    )
    gr.Markdown(
        "Compare Results of the π€ [Open LLM Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). "
        "Check out the [documentation](https://huggingface.co/docs/leaderboards/open_llm_leaderboard/about) π to find explanations on the evaluations used, their configuration parameters and details on the input/outputs for the models."
    )

    # --- Model selection -----------------------------------------------
    # Two side-by-side columns, one per model. Each column also owns a hidden
    # Dataframe that holds the loaded results for that model; the hidden
    # dataframes act as state that other events read from and react to.
    with gr.Row():
        with gr.Column():
            model_id_1 = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models")
            dataframe_1 = gr.Dataframe(visible=False)
        with gr.Column():
            model_id_2 = gr.Dropdown(choices=list(result_paths_per_model.keys()), label="Models")
            dataframe_2 = gr.Dataframe(visible=False)

    # --- Tabs: Results / Configs / Details -----------------------------
    with gr.Row():
        with gr.Tab("Results"):
            # Load starts disabled; it is updated by
            # `update_load_results_component` when a model dropdown is edited.
            load_results_btn = gr.Button("Load", interactive=False)
            clear_results_btn = gr.Button("Clear")
            # Task filter, created hidden; it is an output of
            # `update_tasks_component`, which runs after results are loaded.
            results_task = gr.Radio(
                ["All"] + list(constants.TASKS.values()),
                label="Tasks",
                info="Evaluation tasks to be displayed",
                value="All",
                visible=False,
            )
            results_task_description = gr.Textbox(
                label="Task Description",
                lines=3,
                visible=False,
            )
            hide_std_errors = gr.Checkbox(label="Hide Standard Errors", value=True, info="Options")
            # Target for the rendered results (and for the loading message).
            results = gr.HTML()
        with gr.Tab("Configs"):
            # Mirrors the "Results" tab: same buttons, same task radio and
            # description, but with its own option checkbox and HTML target.
            load_configs_btn = gr.Button("Load", interactive=False)
            clear_configs_btn = gr.Button("Clear")
            configs_task = gr.Radio(
                ["All"] + list(constants.TASKS.values()),
                label="Tasks",
                info="Evaluation tasks to be displayed",
                value="All",
                visible=False,
            )
            configs_task_description = gr.Textbox(
                label="Task Description",
                lines=3,
                visible=False,
            )
            show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
            configs = gr.HTML()
        with gr.Tab("Details"):
            # Unlike the other two tabs, there is no "All" choice here and the
            # task radio is visible from the start.
            details_task = gr.Radio(
                list(constants.TASKS.values()),
                label="Tasks",
                info="Evaluation tasks to be loaded",
                interactive=True,
            )
            details_task_description = gr.Textbox(
                label="Task Description",
                lines=3,
            )
            # NOTE(review): extent of this inner row is reconstructed -- it is
            # assumed to hold the login button and the subtask radio, which
            # are updated together by `update_subtasks_component`. Confirm.
            with gr.Row():
                login_btn = gr.LoginButton(size="sm", visible=False)
                subtask = gr.Radio(
                    choices=None,  # constants.SUBTASKS.get(details_task.value),
                    label="Subtasks",
                    info="Evaluation subtasks to be loaded (choose one of the Tasks above)",
                )
            load_details_btn = gr.Button("Load Details", interactive=False)
            clear_details_btn = gr.Button("Clear Details")
            # Hidden at creation; it is the output of
            # `update_sample_idx_component` once the details are loaded.
            sample_idx = gr.Number(
                label="Sample Index", info="Index of the sample to be displayed", value=0, minimum=0, visible=False
            )
            details_show_only_differences = gr.Checkbox(label="Show Only Differences", value=False, info="Options")
            details = gr.HTML()
            # Hidden per-model state for the loaded details, analogous to
            # dataframe_1 / dataframe_2 above.
            details_dataframe_1 = gr.Dataframe(visible=False)
            details_dataframe_2 = gr.Dataframe(visible=False)
            # NOTE(review): this component is not referenced by any event
            # below -- possibly a leftover; confirm before removing.
            details_dataframe = gr.DataFrame(visible=False)

    # ===================================================================
    # RESULTS / CONFIGS event wiring
    # ===================================================================

    # Editing either model dropdown refreshes both "Load" buttons. The handler
    # takes no inputs; what it returns is defined in src.results.
    gr.on(
        triggers=[model_id_1.input, model_id_2.input],
        fn=update_load_results_component,
        outputs=[load_results_btn, load_configs_btn],
    )
    # Either "Load" button runs the same three-step chain:
    #   1. write a loading message into both HTML panes,
    #   2. load both models' dataframes into the hidden state components,
    #   3. update both task radios.
    gr.on(
        triggers=[load_results_btn.click, load_configs_btn.click],
        fn=display_loading_message_for_results,
        outputs=[results, configs],
    ).then(
        fn=load_results_dataframes,
        inputs=[model_id_1, model_id_2],
        outputs=[dataframe_1, dataframe_2],
    ).then(
        fn=update_tasks_component,
        outputs=[results_task, configs_task],
    )
    # Synchronize the results_task and configs_task radio buttons
    # `.input` is used rather than `.change`: per the Gradio docs `.input`
    # fires only on user interaction, so copying the value into the other
    # radio does not trigger the reverse copy.
    results_task.input(fn=lambda task: task, inputs=results_task, outputs=configs_task)
    configs_task.input(fn=lambda task: task, inputs=configs_task, outputs=results_task)
    # Update task descriptions
    # Only results_task.change is listened to; the same handler is then run a
    # second time with the same input to fill the Configs-tab description.
    results_task.change(
        fn=update_task_description_component,
        inputs=results_task,
        outputs=results_task_description,
    ).then(
        fn=update_task_description_component,
        inputs=results_task,
        outputs=configs_task_description,
    )
    # Display results
    # Re-render both panes whenever the loaded data, the selected task, or
    # either option checkbox changes.
    gr.on(
        triggers=[
            dataframe_1.change,
            dataframe_2.change,
            results_task.change,
            hide_std_errors.change,
            show_only_differences.change,
        ],
        fn=display_results,
        inputs=[results_task, hide_std_errors, show_only_differences, dataframe_1, dataframe_2],
        outputs=[results, configs],
    )
    # Either "Clear" button resets the model dropdowns, the hidden dataframes,
    # both "Load" buttons and both task radios. The HTML panes are not listed
    # as outputs here; they are outputs of the `display_results` listener
    # above, which is triggered by changes to the hidden dataframes.
    gr.on(
        triggers=[clear_results_btn.click, clear_configs_btn.click],
        fn=clear_results,
        outputs=[
            model_id_1,
            model_id_2,
            dataframe_1,
            dataframe_2,
            load_results_btn,
            load_configs_btn,
            results_task,
            configs_task,
        ],
    )

    # ===================================================================
    # DETAILS event wiring
    # ===================================================================
    # DETAILS:
    # Selecting a task updates its description, then updates the login button
    # and the subtask radio for that task.
    details_task.change(
        fn=update_task_description_component,
        inputs=details_task,
        outputs=details_task_description,
    ).then(
        fn=update_subtasks_component,
        inputs=details_task,
        outputs=[login_btn, subtask],
    )
    # Any user edit to the model dropdowns, the subtask or the task re-runs
    # the handler that updates the "Load Details" button, based on the two
    # model ids and the current subtask.
    gr.on(
        triggers=[model_id_1.input, model_id_2.input, subtask.input, details_task.input],
        fn=update_load_details_component,
        inputs=[model_id_1, model_id_2, subtask],
        outputs=load_details_btn,
    )
    # "Load Details" chain: loading message -> load both details dataframes
    # into hidden state -> update the sample index control from them.
    load_details_btn.click(
        fn=display_loading_message_for_details,
        outputs=details,
    ).then(
        fn=load_details_dataframes,
        inputs=[subtask, model_id_1, model_id_2],
        outputs=[details_dataframe_1, details_dataframe_2],
    ).then(
        fn=update_sample_idx_component,
        inputs=[details_dataframe_1, details_dataframe_2],
        outputs=sample_idx,
    )
    # Re-render the details pane whenever the loaded data, the sample index or
    # the "Show Only Differences" option changes.
    gr.on(
        triggers=[
            details_dataframe_1.change,
            details_dataframe_2.change,
            sample_idx.change,
            details_show_only_differences.change,
        ],
        fn=display_details,
        inputs=[sample_idx, details_show_only_differences, details_dataframe_1, details_dataframe_2],
        outputs=details,
    )
    # "Clear Details" resets the shared model dropdowns as well as the
    # Details-tab state. Note that the Results/Configs "Load" buttons are not
    # among the outputs here even though the dropdowns they depend on are.
    clear_details_btn.click(
        fn=clear_details,
        outputs=[
            model_id_1,
            model_id_2,
            details_dataframe_1,
            details_dataframe_2,
            details_task,
            subtask,
            load_details_btn,
            sample_idx,
        ],
    )

# Start the app. This runs at import time: the `__main__` guard above this
# block is commented out.
demo.launch()
|