Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
alozowski
committed on
Commit
•
34f418b
1
Parent(s):
37b898a
Improved leaderboard update [wip]
Browse files- app.py +18 -50
- src/leaderboard/data.py +79 -0
app.py
CHANGED
@@ -44,6 +44,7 @@ from src.envs import (
|
|
44 |
HF_HOME,
|
45 |
)
|
46 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
|
|
47 |
from src.submission.submit import add_new_eval
|
48 |
from src.voting.vote_system import VoteManager, run_scheduler
|
49 |
|
@@ -59,13 +60,17 @@ DO_FULL_INIT = True # os.getenv("LEADERBOARD_FULL_INIT", "True") == "True"
|
|
59 |
NEW_DATA_ON_LEADERBOARD = True
|
60 |
LEADERBOARD_DF = None
|
61 |
|
|
|
|
|
|
|
|
|
62 |
def restart_space():
|
63 |
logging.info(f"Restarting space with repo ID: {REPO_ID}")
|
64 |
try:
|
65 |
# Check if new data is pending and download if necessary
|
66 |
if NEW_DATA_ON_LEADERBOARD:
|
67 |
logging.info("Fetching latest leaderboard data before restart.")
|
68 |
-
|
69 |
|
70 |
# Now restart the space
|
71 |
API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
|
@@ -109,37 +114,6 @@ def download_dataset(repo_id, local_dir, repo_type="dataset", max_attempts=3, ba
|
|
109 |
attempt += 1
|
110 |
raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
|
111 |
|
112 |
-
def get_latest_data_leaderboard(leaderboard_initial_df=None):
|
113 |
-
global NEW_DATA_ON_LEADERBOARD
|
114 |
-
global LEADERBOARD_DF
|
115 |
-
if NEW_DATA_ON_LEADERBOARD:
|
116 |
-
logging.info("Leaderboard updated at reload!")
|
117 |
-
try:
|
118 |
-
leaderboard_dataset = datasets.load_dataset(
|
119 |
-
AGGREGATED_REPO,
|
120 |
-
"default",
|
121 |
-
split="train",
|
122 |
-
cache_dir=HF_HOME,
|
123 |
-
download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD, # Always download fresh data
|
124 |
-
verification_mode="no_checks"
|
125 |
-
)
|
126 |
-
LEADERBOARD_DF = get_leaderboard_df(
|
127 |
-
leaderboard_dataset=leaderboard_dataset,
|
128 |
-
cols=COLS,
|
129 |
-
benchmark_cols=BENCHMARK_COLS,
|
130 |
-
)
|
131 |
-
logging.info("Leaderboard dataset successfully downloaded.")
|
132 |
-
except Exception as e:
|
133 |
-
logging.error(f"Failed to download leaderboard dataset: {e}")
|
134 |
-
return
|
135 |
-
|
136 |
-
# Reset the flag after successful download
|
137 |
-
NEW_DATA_ON_LEADERBOARD = False
|
138 |
-
else:
|
139 |
-
LEADERBOARD_DF = leaderboard_initial_df
|
140 |
-
logging.info("Using cached leaderboard dataset.")
|
141 |
-
return LEADERBOARD_DF
|
142 |
-
|
143 |
|
144 |
def get_latest_data_queue():
|
145 |
eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
@@ -147,8 +121,7 @@ def get_latest_data_queue():
|
|
147 |
|
148 |
def init_space():
|
149 |
"""Initializes the application space, loading only necessary data."""
|
150 |
-
|
151 |
-
NEW_DATA_ON_LEADERBOARD = True # Ensure new data is always pulled on restart
|
152 |
|
153 |
if DO_FULL_INIT:
|
154 |
# These downloads only occur on full initialization
|
@@ -158,18 +131,14 @@ def init_space():
|
|
158 |
except Exception:
|
159 |
restart_space()
|
160 |
|
161 |
-
# Always redownload the leaderboard DataFrame
|
162 |
-
global LEADERBOARD_DF
|
163 |
-
LEADERBOARD_DF = get_latest_data_leaderboard()
|
164 |
-
|
165 |
# Evaluation queue DataFrame retrieval is independent of initialization detail level
|
166 |
eval_queue_dfs = get_latest_data_queue()
|
167 |
-
|
168 |
-
return LEADERBOARD_DF, eval_queue_dfs
|
169 |
|
170 |
# Initialize VoteManager
|
171 |
vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
|
172 |
|
|
|
173 |
|
174 |
# Schedule the upload_votes method to run every 15 minutes
|
175 |
schedule.every(15).minutes.do(vote_manager.upload_votes)
|
@@ -180,10 +149,11 @@ scheduler_thread.start()
|
|
180 |
|
181 |
# Calls the init_space function with the `full_init` parameter determined by the `do_full_init` variable.
|
182 |
# This initializes various DataFrames used throughout the application, with the level of initialization detail controlled by the `do_full_init` flag.
|
183 |
-
|
184 |
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
|
185 |
|
186 |
|
|
|
187 |
# Function to check if a user is logged in
|
188 |
def check_login(profile: gr.OAuthProfile | None) -> bool:
|
189 |
if profile is None:
|
@@ -193,8 +163,11 @@ def check_login(profile: gr.OAuthProfile | None) -> bool:
|
|
193 |
def init_leaderboard(dataframe):
|
194 |
if dataframe is None or dataframe.empty:
|
195 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
|
|
|
|
|
|
196 |
return Leaderboard(
|
197 |
-
value=
|
198 |
datatype=[c.type for c in fields(AutoEvalColumn)],
|
199 |
select_columns=SelectColumns(
|
200 |
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
@@ -236,7 +209,7 @@ with main_block:
|
|
236 |
|
237 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
238 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
239 |
-
leaderboard =
|
240 |
|
241 |
with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
|
242 |
with gr.Column():
|
@@ -425,7 +398,7 @@ with main_block:
|
|
425 |
show_copy_button=True,
|
426 |
)
|
427 |
|
428 |
-
main_block.load(fn=
|
429 |
leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
|
430 |
pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
|
431 |
|
@@ -466,14 +439,9 @@ webhooks_server = enable_space_ci_and_return_server(ui=main_block)
|
|
466 |
def update_leaderboard(payload: WebhookPayload) -> None:
|
467 |
"""Redownloads the leaderboard dataset each time it updates"""
|
468 |
if payload.repo.type == "dataset" and payload.event.action == "update":
|
469 |
-
global NEW_DATA_ON_LEADERBOARD
|
470 |
logging.info("New data detected, downloading updated leaderboard dataset.")
|
471 |
-
|
472 |
-
# Mark the flag for new data
|
473 |
-
NEW_DATA_ON_LEADERBOARD = True
|
474 |
-
|
475 |
# Now actually download the latest data immediately
|
476 |
-
|
477 |
|
478 |
# The below code is not used at the moment, as we can manage the queue file locally
|
479 |
LAST_UPDATE_QUEUE = datetime.datetime.now()
|
|
|
44 |
HF_HOME,
|
45 |
)
|
46 |
from src.populate import get_evaluation_queue_df, get_leaderboard_df
|
47 |
+
from src.leaderboard.data import LeaderboardData
|
48 |
from src.submission.submit import add_new_eval
|
49 |
from src.voting.vote_system import VoteManager, run_scheduler
|
50 |
|
|
|
60 |
NEW_DATA_ON_LEADERBOARD = True
|
61 |
LEADERBOARD_DF = None
|
62 |
|
63 |
+
|
64 |
+
leaderboard_data = LeaderboardData()
|
65 |
+
|
66 |
+
|
67 |
def restart_space():
|
68 |
logging.info(f"Restarting space with repo ID: {REPO_ID}")
|
69 |
try:
|
70 |
# Check if new data is pending and download if necessary
|
71 |
if NEW_DATA_ON_LEADERBOARD:
|
72 |
logging.info("Fetching latest leaderboard data before restart.")
|
73 |
+
leaderboard_data.update()
|
74 |
|
75 |
# Now restart the space
|
76 |
API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)
|
|
|
114 |
attempt += 1
|
115 |
raise Exception(f"Failed to download {repo_id} after {max_attempts} attempts")
|
116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
def get_latest_data_queue():
|
119 |
eval_queue_dfs = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
|
|
|
121 |
|
122 |
def init_space():
|
123 |
"""Initializes the application space, loading only necessary data."""
|
124 |
+
leaderboard_data.update()
|
|
|
125 |
|
126 |
if DO_FULL_INIT:
|
127 |
# These downloads only occur on full initialization
|
|
|
131 |
except Exception:
|
132 |
restart_space()
|
133 |
|
|
|
|
|
|
|
|
|
134 |
# Evaluation queue DataFrame retrieval is independent of initialization detail level
|
135 |
eval_queue_dfs = get_latest_data_queue()
|
136 |
+
return eval_queue_dfs
|
|
|
137 |
|
138 |
# Initialize VoteManager
|
139 |
vote_manager = VoteManager(VOTES_PATH, EVAL_REQUESTS_PATH, VOTES_REPO)
|
140 |
|
141 |
+
schedule.every(15).seconds.do(leaderboard_data.update)
|
142 |
|
143 |
# Schedule the upload_votes method to run every 15 minutes
|
144 |
schedule.every(15).minutes.do(vote_manager.upload_votes)
|
|
|
149 |
|
150 |
# Calls init_space(), which takes no arguments: it refreshes the leaderboard data and, when the module-level DO_FULL_INIT flag is set, performs the downloads needed for a full initialization.
|
151 |
# The call returns the evaluation-queue DataFrames used throughout the application; the level of initialization detail is controlled by the DO_FULL_INIT flag.
|
152 |
+
eval_queue_dfs = init_space()
|
153 |
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = eval_queue_dfs
|
154 |
|
155 |
|
156 |
+
<<<<<<< Updated upstream
|
157 |
# Function to check if a user is logged in
|
158 |
def check_login(profile: gr.OAuthProfile | None) -> bool:
|
159 |
if profile is None:
|
|
|
163 |
def init_leaderboard(dataframe):
|
164 |
if dataframe is None or dataframe.empty:
|
165 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
166 |
+
=======
|
167 |
+
def make_leaderboard(leaderboard_data: LeaderboardData):
|
168 |
+
>>>>>>> Stashed changes
|
169 |
return Leaderboard(
|
170 |
+
value=leaderboard_data.get_data(),
|
171 |
datatype=[c.type for c in fields(AutoEvalColumn)],
|
172 |
select_columns=SelectColumns(
|
173 |
default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
|
|
|
209 |
|
210 |
with gr.Tabs(elem_classes="tab-buttons") as tabs:
|
211 |
with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
|
212 |
+
leaderboard = make_leaderboard(leaderboard_data)
|
213 |
|
214 |
with gr.TabItem("🚀 Submit ", elem_id="llm-benchmark-tab-table", id=5):
|
215 |
with gr.Column():
|
|
|
398 |
show_copy_button=True,
|
399 |
)
|
400 |
|
401 |
+
# main_block.load(fn=leaderboard_data.get_data, inputs=[leaderboard], outputs=[leaderboard])
|
402 |
leaderboard.change(fn=get_latest_data_queue, inputs=None, outputs=[finished_eval_table, running_eval_table, pending_eval_table])
|
403 |
pending_eval_table.change(fn=vote_manager.create_request_vote_df, inputs=[pending_eval_table], outputs=[pending_eval_table_votes])
|
404 |
|
|
|
439 |
def update_leaderboard(payload: WebhookPayload) -> None:
    """Refresh the leaderboard data when a webhook reports a dataset update.

    Payloads whose repo type is not "dataset", or whose event action is not
    "update", are ignored.
    """
    is_dataset_update = payload.repo.type == "dataset" and payload.event.action == "update"
    if not is_dataset_update:
        return

    logging.info("New data detected, downloading updated leaderboard dataset.")
    # Pull the latest data right away instead of waiting for the next scheduled refresh
    leaderboard_data.update()
|
445 |
|
446 |
# The below code is not used at the moment, as we can manage the queue file locally
|
447 |
LAST_UPDATE_QUEUE = datetime.datetime.now()
|
src/leaderboard/data.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import datasets
|
3 |
+
from src.populate import get_leaderboard_df
|
4 |
+
from src.envs import AGGREGATED_REPO, HF_HOME
|
5 |
+
from src.display.utils import COLS, BENCHMARK_COLS
|
6 |
+
|
7 |
+
class LeaderboardData:
    """In-memory holder for the leaderboard table, refreshable on demand.

    The table is built by loading the ``train`` split of the ``default``
    config of ``aggregated_repo`` with ``datasets.load_dataset`` and passing
    the result to ``get_leaderboard_df``. Nothing is downloaded at
    construction time: ``get_data()`` returns ``None`` until the first
    successful ``update()``.

    A failed refresh never clears previously loaded data; the last
    successfully built table stays available through ``get_data()``.
    """

    def __init__(self):
        # Last successfully built leaderboard table (None until the first
        # successful update). Presumably a pandas DataFrame, i.e. whatever
        # get_leaderboard_df returns -- confirm against src.populate.
        self.__data = None
        # Configuration is copied from the project-level constants so each
        # instance carries its own settings.
        self.aggregated_repo = AGGREGATED_REPO
        self.hf_home = HF_HOME
        self.cols = COLS
        self.benchmark_cols = BENCHMARK_COLS

    def __update(self):
        """Download the aggregated dataset and rebuild the leaderboard table.

        Returns:
            The freshly built table on success, or ``None`` if the download
            or the table construction raised. On failure the previously
            stored table is left untouched.
        """
        try:
            leaderboard_dataset = datasets.load_dataset(
                self.aggregated_repo,
                "default",
                split="train",
                cache_dir=self.hf_home,
                # Always fetch fresh data instead of reusing the local cache
                download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD,
                verification_mode="no_checks",
            )

            # Only assigned once both the download and the conversion have
            # succeeded, so readers never observe a half-built table.
            self.__data = get_leaderboard_df(
                leaderboard_dataset=leaderboard_dataset,
                cols=self.cols,
                benchmark_cols=self.benchmark_cols,
            )
        except Exception as e:
            # Deliberate best-effort boundary: a failed refresh must not take
            # the app down. logging.exception keeps the traceback, which
            # matters because this block covers both the download and the
            # DataFrame construction.
            logging.exception(f"Failed to download leaderboard dataset: {e}")
            return None

        logging.info("Leaderboard dataset successfully downloaded.")
        return self.__data

    def update(self):
        """Trigger a refresh of the leaderboard data.

        Returns:
            The refreshed table, or ``None`` if the refresh failed (in which
            case ``get_data()`` keeps returning the previous table).
        """
        # Logged before the attempt, so it must not claim the update already
        # happened; success is reported separately once the refresh completes.
        logging.info("Refreshing leaderboard data.")
        return self.__update()

    def get_data(self):
        """Return the most recently loaded leaderboard table, or ``None`` if none was loaded yet."""
        return self.__data
|
48 |
+
|
49 |
+
|
50 |
+
# def get_latest_data_leaderboard(leaderboard_initial_df=None):
|
51 |
+
# global NEW_DATA_ON_LEADERBOARD
|
52 |
+
# global LEADERBOARD_DF
|
53 |
+
# if NEW_DATA_ON_LEADERBOARD:
|
54 |
+
# logging.info("Leaderboard updated at reload!")
|
55 |
+
# try:
|
56 |
+
# leaderboard_dataset = datasets.load_dataset(
|
57 |
+
# AGGREGATED_REPO,
|
58 |
+
# "default",
|
59 |
+
# split="train",
|
60 |
+
# cache_dir=HF_HOME,
|
61 |
+
# download_mode=datasets.DownloadMode.FORCE_REDOWNLOAD, # Always download fresh data
|
62 |
+
# verification_mode="no_checks"
|
63 |
+
# )
|
64 |
+
# LEADERBOARD_DF = get_leaderboard_df(
|
65 |
+
# leaderboard_dataset=leaderboard_dataset,
|
66 |
+
# cols=COLS,
|
67 |
+
# benchmark_cols=BENCHMARK_COLS,
|
68 |
+
# )
|
69 |
+
# logging.info("Leaderboard dataset successfully downloaded.")
|
70 |
+
# except Exception as e:
|
71 |
+
# logging.error(f"Failed to download leaderboard dataset: {e}")
|
72 |
+
# return
|
73 |
+
|
74 |
+
# # Reset the flag after successful download
|
75 |
+
# NEW_DATA_ON_LEADERBOARD = False
|
76 |
+
# else:
|
77 |
+
# LEADERBOARD_DF = leaderboard_initial_df
|
78 |
+
# logging.info("Using cached leaderboard dataset.")
|
79 |
+
# return LEADERBOARD_DF
|