Spaces:
Runtime error
Runtime error
Clémentine
commited on
Commit
•
9166535
1
Parent(s):
33475fb
???
Browse files- app.py +22 -260
- app_bkp.py +316 -0
- src/display/about.py +2 -85
app.py
CHANGED
@@ -37,205 +37,33 @@ from src.envs import (
|
|
37 |
REPO_ID,
|
38 |
HF_HOME,
|
39 |
)
|
40 |
-
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
41 |
-
from src.submission.submit import add_new_eval
|
42 |
-
from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
|
43 |
-
|
44 |
-
# Configure logging
|
45 |
-
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
46 |
-
|
47 |
-
|
48 |
-
# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
|
49 |
-
# This controls whether a full initialization should be performed.
|
50 |
-
DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
|
51 |
-
LAST_UPDATE_LEADERBOARD = datetime.datetime.now()
|
52 |
-
|
53 |
-
def restart_space():
|
54 |
-
API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
|
55 |
-
|
56 |
-
|
57 |
-
def time_diff_wrapper(func):
|
58 |
-
def wrapper(*args, **kwargs):
|
59 |
-
start_time = time.time()
|
60 |
-
result = func(*args, **kwargs)
|
61 |
-
end_time = time.time()
|
62 |
-
diff = end_time - start_time
|
63 |
-
logging.info(f"Time taken for {func.__name__}: {diff} seconds")
|
64 |
-
return result
|
65 |
-
|
66 |
-
return wrapper
|
67 |
-
|
68 |
-
|
69 |
-
@time_diff_wrapper
|
70 |
-
def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
|
71 |
-
"""Download dataset with exponential backoff retries."""
|
72 |
-
attempt = 0
|
73 |
-
while attempt < max_attempts:
|
74 |
-
try:
|
75 |
-
logging.info(f"Downloading {repo_id} to {local_dir}")
|
76 |
-
snapshot_download(
|
77 |
-
repo_id=repo_id,
|
78 |
-
local_dir=local_dir,
|
79 |
-
repo_type=repo_type,
|
80 |
-
tqdm_class=None,
|
81 |
-
etag_timeout=30,
|
82 |
-
max_workers=8,
|
83 |
-
)
|
84 |
-
logging.info("Download successful")
|
85 |
-
return
|
86 |
-
except Exception as e:
|
87 |
-
wait_time = backoff_factor**attempt
|
88 |
-
logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
|
89 |
-
time.sleep(wait_time)
|
90 |
-
attempt += 1
|
91 |
-
raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
|
92 |
-
|
93 |
-
def get_latest_data_leaderboard(leaderboard_initial_df = None):
|
94 |
-
current_time = datetime.datetime.now()
|
95 |
-
global LAST_UPDATE_LEADERBOARD
|
96 |
-
if current_time - LAST_UPDATE_LEADERBOARD < datetime.timedelta(minutes=10) and leaderboard_initial_df is not None:
|
97 |
-
return leaderboard_initial_df
|
98 |
-
LAST_UPDATE_LEADERBOARD = current_time
|
99 |
-
leaderboard_dataset = datasets.load_dataset(
|
100 |
-
AGGREGATED_REPO,
|
101 |
-
"default",
|
102 |
-
split="train",
|
103 |
-
cache_dir=HF_HOME,
|
104 |
-
download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
|
105 |
-
verification_mode="no_checks"
|
106 |
-
)
|
107 |
-
|
108 |
-
leaderboard_df = get_leaderboard_df(
|
109 |
-
leaderboard_dataset=leaderboard_dataset,
|
110 |
-
cols=COLS,
|
111 |
-
benchmark_cols=BENCHMARK_COLS,
|
112 |
-
)
|
113 |
-
|
114 |
-
return leaderboard_df
|
115 |
-
|
116 |
-
def get_latest_data_queue():
|
117 |
-
eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
118 |
-
return eval_queue_dfs
|
119 |
-
|
120 |
-
def init_space():
|
121 |
-
"""Initializes the application space, loading only necessary data."""
|
122 |
-
if DO_FULL_INIT:
|
123 |
-
# These downloads only occur on full initialization
|
124 |
-
try:
|
125 |
-
download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
|
126 |
-
except Exception:
|
127 |
-
restart_space()
|
128 |
-
|
129 |
-
# Always redownload the leaderboard DataFrame
|
130 |
-
leaderboard_df = get_latest_data_leaderboard()
|
131 |
-
|
132 |
-
# Evaluation queue DataFrame retrieval is independent of initialization detail level
|
133 |
-
eval_queue_dfs = get_latest_data_queue()
|
134 |
-
|
135 |
-
return leaderboard_df, eval_queue_dfs
|
136 |
-
|
137 |
-
|
138 |
-
# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
|
139 |
-
# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
|
140 |
-
leaderboard_df, eval_queue_dfs = init_space()
|
141 |
-
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
|
142 |
-
|
143 |
-
|
144 |
-
# Data processing for plots now only on demand in the respective Gradio tab
|
145 |
-
def load_and_create_plots():
|
146 |
-
plot_df = create_plot_df(create_scores_df(leaderboard_df))
|
147 |
-
return plot_df
|
148 |
-
|
149 |
-
def init_leaderboard(dataframe):
|
150 |
-
return Leaderboard(
|
151 |
-
value = dataframe,
|
152 |
-
datatype=[c.type for c in fields(AutoEvalColumn)],
|
153 |
-
select_columns=SelectColumns(
|
154 |
-
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
155 |
-
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
|
156 |
-
label="Select Columns to Display:",
|
157 |
-
),
|
158 |
-
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
|
159 |
-
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
160 |
-
filter_columns=[
|
161 |
-
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
162 |
-
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
163 |
-
ColumnFilter(
|
164 |
-
AutoEvalColumn.params.name,
|
165 |
-
type="slider",
|
166 |
-
min=0.01,
|
167 |
-
max=150,
|
168 |
-
label="Select the number of parameters (B)",
|
169 |
-
),
|
170 |
-
ColumnFilter(
|
171 |
-
AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
|
172 |
-
),
|
173 |
-
ColumnFilter(
|
174 |
-
AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
|
175 |
-
),
|
176 |
-
ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
|
177 |
-
ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
|
178 |
-
],
|
179 |
-
bool_checkboxgroup_label="Hide models",
|
180 |
-
interactive=False,
|
181 |
-
)
|
182 |
|
183 |
demo = gr.Blocks(css=custom_css)
|
184 |
with demo:
|
185 |
gr.HTML(TITLE)
|
186 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
187 |
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
gr.Plot(value=chart, min_width=500)
|
210 |
-
|
211 |
-
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
212 |
-
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
213 |
-
|
214 |
-
with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
|
215 |
-
gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
|
216 |
-
|
217 |
-
with gr.TabItem("🚀 Submit? ", elem_id="llm-benchmark-tab-table", id=5):
|
218 |
-
countdown = gr.HTML(
|
219 |
-
"""<div align="center">
|
220 |
-
<div position: relative>
|
221 |
-
<img
|
222 |
-
src="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/resolve/main/gif.gif"
|
223 |
-
allowtransparency="true"
|
224 |
-
style="display:block;width:100%;height:auto;"
|
225 |
-
/>
|
226 |
-
<iframe
|
227 |
-
src="https://logwork.com/widget/countdown/?text=Surprise%20loading...&timezone=Europe%2FParis&width=&style=circles&uid=815898&loc=https://logwork.com/countdown-fxmc&language=en&textcolor=&background=%23ffd21e&date=2024-06-26%2015%3A00%3A00&digitscolor=%23ff9d00&unitscolor=&"
|
228 |
-
style="position: absolute; top:0; left: 0; border: medium; width:100%; height:100%; margin: 0px; visibility: visible;"
|
229 |
-
scrolling="no"
|
230 |
-
allowtransparency="true"
|
231 |
-
frameborder="0"
|
232 |
-
allowfullscreen
|
233 |
-
/>
|
234 |
-
</div>
|
235 |
-
</div>"""
|
236 |
-
)
|
237 |
-
#gif = gr.Image(value="./gif.gif", interactive=False)
|
238 |
-
gr.Markdown("*Countdown by Logwork.com, gif art by Chun Te Lee*")
|
239 |
|
240 |
with gr.Row():
|
241 |
with gr.Accordion("📙 Citation", open=False):
|
@@ -247,70 +75,4 @@ with demo:
|
|
247 |
show_copy_button=True,
|
248 |
)
|
249 |
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
demo.queue(default_concurrency_limit=40)
|
254 |
-
|
255 |
-
# Start ephemeral Spaces on PRs (see config in README.md)
|
256 |
-
from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
|
257 |
-
|
258 |
-
def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
|
259 |
-
# Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
|
260 |
-
# Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
|
261 |
-
# ht to Lucain!
|
262 |
-
if SPACE_ID is None:
|
263 |
-
print("Not in a Space: Space CI disabled.")
|
264 |
-
return WebhooksServer(ui=demo)
|
265 |
-
|
266 |
-
if IS_EPHEMERAL_SPACE:
|
267 |
-
print("In an ephemeral Space: Space CI disabled.")
|
268 |
-
return WebhooksServer(ui=demo)
|
269 |
-
|
270 |
-
card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
|
271 |
-
config = card.data.get("space_ci", {})
|
272 |
-
print(f"Enabling Space CI with config from README: {config}")
|
273 |
-
|
274 |
-
return configure_space_ci(
|
275 |
-
blocks=ui,
|
276 |
-
trusted_authors=config.get("trusted_authors"),
|
277 |
-
private=config.get("private", "auto"),
|
278 |
-
variables=config.get("variables", "auto"),
|
279 |
-
secrets=config.get("secrets"),
|
280 |
-
hardware=config.get("hardware"),
|
281 |
-
storage=config.get("storage"),
|
282 |
-
)
|
283 |
-
|
284 |
-
# Create webhooks server (with CI url if in Space and not ephemeral)
|
285 |
-
webhooks_server = enable_space_ci_and_return_server(ui=demo)
|
286 |
-
|
287 |
-
# Add webhooks
|
288 |
-
@webhooks_server.add_webhook
|
289 |
-
def update_leaderboard(payload: WebhookPayload) -> None:
|
290 |
-
"""Redownloads the leaderboard dataset each time it updates"""
|
291 |
-
if payload.repo.type == "dataset" and payload.event.action == "update":
|
292 |
-
datasets.load_dataset(
|
293 |
-
AGGREGATED_REPO,
|
294 |
-
"default",
|
295 |
-
split="train",
|
296 |
-
cache_dir=HF_HOME,
|
297 |
-
download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
|
298 |
-
verification_mode="no_checks"
|
299 |
-
)
|
300 |
-
|
301 |
-
# The below code is not used at the moment, as we can manage the queue file locally
|
302 |
-
LAST_UPDATE_QUEUE = datetime.datetime.now()
|
303 |
-
@webhooks_server.add_webhook
|
304 |
-
def update_queue(payload: WebhookPayload) -> None:
|
305 |
-
"""Redownloads the queue dataset each time it updates"""
|
306 |
-
if payload.repo.type == "dataset" and payload.event.action == "update":
|
307 |
-
current_time = datetime.datetime.now()
|
308 |
-
global LAST_UPDATE_QUEUE
|
309 |
-
if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
|
310 |
-
print("Would have updated the queue")
|
311 |
-
# We only redownload is last update was more than 10 minutes ago, as the queue is
|
312 |
-
# updated regularly and heavy to download
|
313 |
-
#download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
|
314 |
-
LAST_UPDATE_QUEUE = datetime.datetime.now()
|
315 |
-
|
316 |
-
webhooks_server.launch()
|
|
|
37 |
REPO_ID,
|
38 |
HF_HOME,
|
39 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
demo = gr.Blocks(css=custom_css)
|
42 |
with demo:
|
43 |
gr.HTML(TITLE)
|
44 |
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
45 |
|
46 |
+
countdown = gr.HTML(
|
47 |
+
"""<div align="center">
|
48 |
+
<div position: relative>
|
49 |
+
<img
|
50 |
+
src="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/resolve/main/gif.gif"
|
51 |
+
allowtransparency="true"
|
52 |
+
style="display:block;width:100%;height:auto;"
|
53 |
+
/>
|
54 |
+
<iframe
|
55 |
+
src="https://logwork.com/widget/countdown/?text=Surprise%20loading...&timezone=Europe%2FParis&width=&style=circles&uid=815898&loc=https://logwork.com/countdown-fxmc&language=en&textcolor=&background=%23ffd21e&date=2024-06-26%2015%3A00%3A00&digitscolor=%23ff9d00&unitscolor=&"
|
56 |
+
style="position: absolute; top:0; left: 0; border: medium; width:100%; height:100%; margin: 0px; visibility: visible;"
|
57 |
+
scrolling="no"
|
58 |
+
allowtransparency="true"
|
59 |
+
frameborder="0"
|
60 |
+
allowfullscreen
|
61 |
+
/>
|
62 |
+
</div>
|
63 |
+
</div>"""
|
64 |
+
)
|
65 |
+
#gif = gr.Image(value="./gif.gif", interactive=False)
|
66 |
+
gr.Markdown("*Countdown by Logwork.com, gif art by Chun Te Lee*")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
|
68 |
with gr.Row():
|
69 |
with gr.Accordion("📙 Citation", open=False):
|
|
|
75 |
show_copy_button=True,
|
76 |
)
|
77 |
|
78 |
+
demo.queue(default_concurrency_limit=40).launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app_bkp.py
ADDED
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
import time
|
4 |
+
import datetime
|
5 |
+
import gradio as gr
|
6 |
+
import datasets
|
7 |
+
from huggingface_hub import snapshot_download, WebhooksServer, WebhookPayload, RepoCard
|
8 |
+
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
|
9 |
+
|
10 |
+
from src.display.about import (
|
11 |
+
CITATION_BUTTON_LABEL,
|
12 |
+
CITATION_BUTTON_TEXT,
|
13 |
+
EVALUATION_QUEUE_TEXT,
|
14 |
+
FAQ_TEXT,
|
15 |
+
INTRODUCTION_TEXT,
|
16 |
+
LLM_BENCHMARKS_TEXT,
|
17 |
+
TITLE,
|
18 |
+
)
|
19 |
+
from src.display.css_html_js import custom_css
|
20 |
+
from src.display.utils import (
|
21 |
+
BENCHMARK_COLS,
|
22 |
+
COLS,
|
23 |
+
EVAL_COLS,
|
24 |
+
EVAL_TYPES,
|
25 |
+
AutoEvalColumn,
|
26 |
+
ModelType,
|
27 |
+
Precision,
|
28 |
+
WeightType,
|
29 |
+
fields,
|
30 |
+
)
|
31 |
+
from src.envs import (
|
32 |
+
API,
|
33 |
+
EVAL_REQUESTS_PATH,
|
34 |
+
AGGREGATED_REPO,
|
35 |
+
HF_TOKEN,
|
36 |
+
QUEUE_REPO,
|
37 |
+
REPO_ID,
|
38 |
+
HF_HOME,
|
39 |
+
)
|
40 |
+
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
41 |
+
from src.submission.submit import add_new_eval
|
42 |
+
from src.tools.plots import create_metric_plot_obj, create_plot_df, create_scores_df
|
43 |
+
|
44 |
+
# Configure logging
|
45 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
46 |
+
|
47 |
+
|
48 |
+
# Convert the environment variable "LEADERBOARD_FULL_INIT" to a boolean value, defaulting to True if the variable is not set.
|
49 |
+
# This controls whether a full initialization should be performed.
|
50 |
+
DO_FULL_INIT = os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
|
51 |
+
LAST_UPDATE_LEADERBOARD = datetime.datetime.now()
|
52 |
+
|
53 |
+
def restart_space():
|
54 |
+
API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
|
55 |
+
|
56 |
+
|
57 |
+
def time_diff_wrapper(func):
|
58 |
+
def wrapper(*args, **kwargs):
|
59 |
+
start_time = time.time()
|
60 |
+
result = func(*args, **kwargs)
|
61 |
+
end_time = time.time()
|
62 |
+
diff = end_time - start_time
|
63 |
+
logging.info(f"Time taken for {func.__name__}: {diff} seconds")
|
64 |
+
return result
|
65 |
+
|
66 |
+
return wrapper
|
67 |
+
|
68 |
+
|
69 |
+
@time_diff_wrapper
|
70 |
+
def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, backoff_factor=1.5):
|
71 |
+
"""Download dataset with exponential backoff retries."""
|
72 |
+
attempt = 0
|
73 |
+
while attempt < max_attempts:
|
74 |
+
try:
|
75 |
+
logging.info(f"Downloading {repo_id} to {local_dir}")
|
76 |
+
snapshot_download(
|
77 |
+
repo_id=repo_id,
|
78 |
+
local_dir=local_dir,
|
79 |
+
repo_type=repo_type,
|
80 |
+
tqdm_class=None,
|
81 |
+
etag_timeout=30,
|
82 |
+
max_workers=8,
|
83 |
+
)
|
84 |
+
logging.info("Download successful")
|
85 |
+
return
|
86 |
+
except Exception as e:
|
87 |
+
wait_time = backoff_factor**attempt
|
88 |
+
logging.error(f"Error downloading {repo_id}: {e}, retrying in {wait_time}s")
|
89 |
+
time.sleep(wait_time)
|
90 |
+
attempt += 1
|
91 |
+
raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
|
92 |
+
|
93 |
+
def get_latest_data_leaderboard(leaderboard_initial_df = None):
|
94 |
+
current_time = datetime.datetime.now()
|
95 |
+
global LAST_UPDATE_LEADERBOARD
|
96 |
+
if current_time - LAST_UPDATE_LEADERBOARD < datetime.timedelta(minutes=10) and leaderboard_initial_df is not None:
|
97 |
+
return leaderboard_initial_df
|
98 |
+
LAST_UPDATE_LEADERBOARD = current_time
|
99 |
+
leaderboard_dataset = datasets.load_dataset(
|
100 |
+
AGGREGATED_REPO,
|
101 |
+
"default",
|
102 |
+
split="train",
|
103 |
+
cache_dir=HF_HOME,
|
104 |
+
download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS, # Uses the cached dataset
|
105 |
+
verification_mode="no_checks"
|
106 |
+
)
|
107 |
+
|
108 |
+
leaderboard_df = get_leaderboard_df(
|
109 |
+
leaderboard_dataset=leaderboard_dataset,
|
110 |
+
cols=COLS,
|
111 |
+
benchmark_cols=BENCHMARK_COLS,
|
112 |
+
)
|
113 |
+
|
114 |
+
return leaderboard_df
|
115 |
+
|
116 |
+
def get_latest_data_queue():
|
117 |
+
eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
118 |
+
return eval_queue_dfs
|
119 |
+
|
120 |
+
def init_space():
|
121 |
+
"""Initializes the application space, loading only necessary data."""
|
122 |
+
if DO_FULL_INIT:
|
123 |
+
# These downloads only occur on full initialization
|
124 |
+
try:
|
125 |
+
download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
|
126 |
+
except Exception:
|
127 |
+
restart_space()
|
128 |
+
|
129 |
+
# Always redownload the leaderboard DataFrame
|
130 |
+
leaderboard_df = get_latest_data_leaderboard()
|
131 |
+
|
132 |
+
# Evaluation queue DataFrame retrieval is independent of initialization detail level
|
133 |
+
eval_queue_dfs = get_latest_data_queue()
|
134 |
+
|
135 |
+
return leaderboard_df, eval_queue_dfs
|
136 |
+
|
137 |
+
|
138 |
+
# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
|
139 |
+
# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
|
140 |
+
leaderboard_df, eval_queue_dfs = init_space()
|
141 |
+
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
|
142 |
+
|
143 |
+
|
144 |
+
# Data processing for plots now only on demand in the respective Gradio tab
|
145 |
+
def load_and_create_plots():
|
146 |
+
plot_df = create_plot_df(create_scores_df(leaderboard_df))
|
147 |
+
return plot_df
|
148 |
+
|
149 |
+
def init_leaderboard(dataframe):
|
150 |
+
return Leaderboard(
|
151 |
+
value = dataframe,
|
152 |
+
datatype=[c.type for c in fields(AutoEvalColumn)],
|
153 |
+
select_columns=SelectColumns(
|
154 |
+
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
155 |
+
cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy],
|
156 |
+
label="Select Columns to Display:",
|
157 |
+
),
|
158 |
+
search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.fullname.name, AutoEvalColumn.license.name],
|
159 |
+
hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
|
160 |
+
filter_columns=[
|
161 |
+
ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
|
162 |
+
ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
|
163 |
+
ColumnFilter(
|
164 |
+
AutoEvalColumn.params.name,
|
165 |
+
type="slider",
|
166 |
+
min=0.01,
|
167 |
+
max=150,
|
168 |
+
label="Select the number of parameters (B)",
|
169 |
+
),
|
170 |
+
ColumnFilter(
|
171 |
+
AutoEvalColumn.still_on_hub.name, type="boolean", label="Private or deleted", default=True
|
172 |
+
),
|
173 |
+
ColumnFilter(
|
174 |
+
AutoEvalColumn.merged.name, type="boolean", label="Contains a merge/moerge", default=True
|
175 |
+
),
|
176 |
+
ColumnFilter(AutoEvalColumn.moe.name, type="boolean", label="MoE", default=False),
|
177 |
+
ColumnFilter(AutoEvalColumn.not_flagged.name, type="boolean", label="Flagged", default=True),
|
178 |
+
],
|
179 |
+
bool_checkboxgroup_label="Hide models",
|
180 |
+
interactive=False,
|
181 |
+
)
|
182 |
+
|
183 |
+
demo = gr.Blocks(css=custom_css)
|
184 |
+
with demo:
|
185 |
+
gr.HTML(TITLE)
|
186 |
+
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
187 |
+
|
188 |
+
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
189 |
+
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
190 |
+
leaderboard = init_leaderboard(leaderboard_df)
|
191 |
+
|
192 |
+
with gr.TabItem("📈 Metrics through time", elem_id="llm-benchmark-tab-table", id=2):
|
193 |
+
with gr.Row():
|
194 |
+
with gr.Column():
|
195 |
+
plot_df = load_and_create_plots()
|
196 |
+
chart = create_metric_plot_obj(
|
197 |
+
plot_df,
|
198 |
+
[AutoEvalColumn.average.name],
|
199 |
+
title="Average of Top Scores and Human Baseline Over Time (from last update)",
|
200 |
+
)
|
201 |
+
gr.Plot(value=chart, min_width=500)
|
202 |
+
with gr.Column():
|
203 |
+
plot_df = load_and_create_plots()
|
204 |
+
chart = create_metric_plot_obj(
|
205 |
+
plot_df,
|
206 |
+
BENCHMARK_COLS,
|
207 |
+
title="Top Scores and Human Baseline Over Time (from last update)",
|
208 |
+
)
|
209 |
+
gr.Plot(value=chart, min_width=500)
|
210 |
+
|
211 |
+
with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=3):
|
212 |
+
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
213 |
+
|
214 |
+
with gr.TabItem("❗FAQ", elem_id="llm-benchmark-tab-table", id=4):
|
215 |
+
gr.Markdown(FAQ_TEXT, elem_classes="markdown-text")
|
216 |
+
|
217 |
+
with gr.TabItem("🚀 Submit? ", elem_id="llm-benchmark-tab-table", id=5):
|
218 |
+
countdown = gr.HTML(
|
219 |
+
"""<div align="center">
|
220 |
+
<div position: relative>
|
221 |
+
<img
|
222 |
+
src="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard/resolve/main/gif.gif"
|
223 |
+
allowtransparency="true"
|
224 |
+
style="display:block;width:100%;height:auto;"
|
225 |
+
/>
|
226 |
+
<iframe
|
227 |
+
src="https://logwork.com/widget/countdown/?text=Surprise%20loading...&timezone=Europe%2FParis&width=&style=circles&uid=815898&loc=https://logwork.com/countdown-fxmc&language=en&textcolor=&background=%23ffd21e&date=2024-06-26%2015%3A00%3A00&digitscolor=%23ff9d00&unitscolor=&"
|
228 |
+
style="position: absolute; top:0; left: 0; border: medium; width:100%; height:100%; margin: 0px; visibility: visible;"
|
229 |
+
scrolling="no"
|
230 |
+
allowtransparency="true"
|
231 |
+
frameborder="0"
|
232 |
+
allowfullscreen
|
233 |
+
/>
|
234 |
+
</div>
|
235 |
+
</div>"""
|
236 |
+
)
|
237 |
+
#gif = gr.Image(value="./gif.gif", interactive=False)
|
238 |
+
gr.Markdown("*Countdown by Logwork.com, gif art by Chun Te Lee*")
|
239 |
+
|
240 |
+
with gr.Row():
|
241 |
+
with gr.Accordion("📙 Citation", open=False):
|
242 |
+
citation_button = gr.Textbox(
|
243 |
+
value=CITATION_BUTTON_TEXT,
|
244 |
+
label=CITATION_BUTTON_LABEL,
|
245 |
+
lines=20,
|
246 |
+
elem_id="citation-button",
|
247 |
+
show_copy_button=True,
|
248 |
+
)
|
249 |
+
|
250 |
+
demo.load(fn=get_latest_data_leaderboard, inputs=[leaderboard], outputs=[leaderboard])
|
251 |
+
|
252 |
+
|
253 |
+
demo.queue(default_concurrency_limit=40)
|
254 |
+
|
255 |
+
# Start ephemeral Spaces on PRs (see config in README.md)
|
256 |
+
from gradio_space_ci.webhook import IS_EPHEMERAL_SPACE, SPACE_ID, configure_space_ci
|
257 |
+
|
258 |
+
def enable_space_ci_and_return_server(ui: gr.Blocks) -> WebhooksServer:
|
259 |
+
# Taken from https://huggingface.co/spaces/Wauplin/gradio-space-ci/blob/075119aee75ab5e7150bf0814eec91c83482e790/src/gradio_space_ci/webhook.py#L61
|
260 |
+
# Compared to original, this one do not monkeypatch Gradio which allows us to define more webhooks.
|
261 |
+
# ht to Lucain!
|
262 |
+
if SPACE_ID is None:
|
263 |
+
print("Not in a Space: Space CI disabled.")
|
264 |
+
return WebhooksServer(ui=demo)
|
265 |
+
|
266 |
+
if IS_EPHEMERAL_SPACE:
|
267 |
+
print("In an ephemeral Space: Space CI disabled.")
|
268 |
+
return WebhooksServer(ui=demo)
|
269 |
+
|
270 |
+
card = RepoCard.load(repo_id_or_path=SPACE_ID, repo_type="space")
|
271 |
+
config = card.data.get("space_ci", {})
|
272 |
+
print(f"Enabling Space CI with config from README: {config}")
|
273 |
+
|
274 |
+
return configure_space_ci(
|
275 |
+
blocks=ui,
|
276 |
+
trusted_authors=config.get("trusted_authors"),
|
277 |
+
private=config.get("private", "auto"),
|
278 |
+
variables=config.get("variables", "auto"),
|
279 |
+
secrets=config.get("secrets"),
|
280 |
+
hardware=config.get("hardware"),
|
281 |
+
storage=config.get("storage"),
|
282 |
+
)
|
283 |
+
|
284 |
+
# Create webhooks server (with CI url if in Space and not ephemeral)
|
285 |
+
webhooks_server = enable_space_ci_and_return_server(ui=demo)
|
286 |
+
|
287 |
+
# Add webhooks
|
288 |
+
@webhooks_server.add_webhook
|
289 |
+
def update_leaderboard(payload: WebhookPayload) -> None:
|
290 |
+
"""Redownloads the leaderboard dataset each time it updates"""
|
291 |
+
if payload.repo.type == "dataset" and payload.event.action == "update":
|
292 |
+
datasets.load_dataset(
|
293 |
+
AGGREGATED_REPO,
|
294 |
+
"default",
|
295 |
+
split="train",
|
296 |
+
cache_dir=HF_HOME,
|
297 |
+
download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
|
298 |
+
verification_mode="no_checks"
|
299 |
+
)
|
300 |
+
|
301 |
+
# The below code is not used at the moment, as we can manage the queue file locally
|
302 |
+
LAST_UPDATE_QUEUE = datetime.datetime.now()
|
303 |
+
@webhooks_server.add_webhook
|
304 |
+
def update_queue(payload: WebhookPayload) -> None:
|
305 |
+
"""Redownloads the queue dataset each time it updates"""
|
306 |
+
if payload.repo.type == "dataset" and payload.event.action == "update":
|
307 |
+
current_time = datetime.datetime.now()
|
308 |
+
global LAST_UPDATE_QUEUE
|
309 |
+
if current_time - LAST_UPDATE_QUEUE > datetime.timedelta(minutes=10):
|
310 |
+
print("Would have updated the queue")
|
311 |
+
# We only redownload is last update was more than 10 minutes ago, as the queue is
|
312 |
+
# updated regularly and heavy to download
|
313 |
+
#download_dataset(QUEUE_REPO, EVAL_REQUESTS_PATH)
|
314 |
+
LAST_UPDATE_QUEUE = datetime.datetime.now()
|
315 |
+
|
316 |
+
webhooks_server.launch()
|
src/display/about.py
CHANGED
@@ -219,89 +219,6 @@ CITATION_BUTTON_TEXT = r"""
|
|
219 |
publisher = {Hugging Face},
|
220 |
howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
|
221 |
}
|
222 |
-
|
223 |
-
|
224 |
-
Tow, Jonathan and
|
225 |
-
Biderman, Stella and
|
226 |
-
Black, Sid and
|
227 |
-
DiPofi, Anthony and
|
228 |
-
Foster, Charles and
|
229 |
-
Golding, Laurence and
|
230 |
-
Hsu, Jeffrey and
|
231 |
-
McDonell, Kyle and
|
232 |
-
Muennighoff, Niklas and
|
233 |
-
Phang, Jason and
|
234 |
-
Reynolds, Laria and
|
235 |
-
Tang, Eric and
|
236 |
-
Thite, Anish and
|
237 |
-
Wang, Ben and
|
238 |
-
Wang, Kevin and
|
239 |
-
Zou, Andy},
|
240 |
-
title = {A framework for few-shot language model evaluation},
|
241 |
-
month = sep,
|
242 |
-
year = 2021,
|
243 |
-
publisher = {Zenodo},
|
244 |
-
version = {v0.0.1},
|
245 |
-
doi = {10.5281/zenodo.5371628},
|
246 |
-
url = {https://doi.org/10.5281/zenodo.5371628}
|
247 |
-
}
|
248 |
-
@misc{clark2018think,
|
249 |
-
title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge},
|
250 |
-
author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},
|
251 |
-
year={2018},
|
252 |
-
eprint={1803.05457},
|
253 |
-
archivePrefix={arXiv},
|
254 |
-
primaryClass={cs.AI}
|
255 |
-
}
|
256 |
-
@misc{zellers2019hellaswag,
|
257 |
-
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
|
258 |
-
author={Rowan Zellers and Ari Holtzman and Yonatan Bisk and Ali Farhadi and Yejin Choi},
|
259 |
-
year={2019},
|
260 |
-
eprint={1905.07830},
|
261 |
-
archivePrefix={arXiv},
|
262 |
-
primaryClass={cs.CL}
|
263 |
-
}
|
264 |
-
@misc{hendrycks2021measuring,
|
265 |
-
title={Measuring Massive Multitask Language Understanding},
|
266 |
-
author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt},
|
267 |
-
year={2021},
|
268 |
-
eprint={2009.03300},
|
269 |
-
archivePrefix={arXiv},
|
270 |
-
primaryClass={cs.CY}
|
271 |
-
}
|
272 |
-
@misc{lin2022truthfulqa,
|
273 |
-
title={TruthfulQA: Measuring How Models Mimic Human Falsehoods},
|
274 |
-
author={Stephanie Lin and Jacob Hilton and Owain Evans},
|
275 |
-
year={2022},
|
276 |
-
eprint={2109.07958},
|
277 |
-
archivePrefix={arXiv},
|
278 |
-
primaryClass={cs.CL}
|
279 |
-
}
|
280 |
-
@misc{DBLP:journals/corr/abs-1907-10641,
|
281 |
-
title={{WINOGRANDE:} An Adversarial Winograd Schema Challenge at Scale},
|
282 |
-
author={Keisuke Sakaguchi and Ronan Le Bras and Chandra Bhagavatula and Yejin Choi},
|
283 |
-
year={2019},
|
284 |
-
eprint={1907.10641},
|
285 |
-
archivePrefix={arXiv},
|
286 |
-
primaryClass={cs.CL}
|
287 |
-
}
|
288 |
-
@misc{DBLP:journals/corr/abs-2110-14168,
|
289 |
-
title={Training Verifiers to Solve Math Word Problems},
|
290 |
-
author={Karl Cobbe and
|
291 |
-
Vineet Kosaraju and
|
292 |
-
Mohammad Bavarian and
|
293 |
-
Mark Chen and
|
294 |
-
Heewoo Jun and
|
295 |
-
Lukasz Kaiser and
|
296 |
-
Matthias Plappert and
|
297 |
-
Jerry Tworek and
|
298 |
-
Jacob Hilton and
|
299 |
-
Reiichiro Nakano and
|
300 |
-
Christopher Hesse and
|
301 |
-
John Schulman},
|
302 |
-
year={2021},
|
303 |
-
eprint={2110.14168},
|
304 |
-
archivePrefix={arXiv},
|
305 |
-
primaryClass={cs.CL}
|
306 |
-
}
|
307 |
"""
|
|
|
219 |
publisher = {Hugging Face},
|
220 |
howpublished = "\url{https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard}"
|
221 |
}
|
222 |
+
|
223 |
+
????
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
224 |
"""
|