Update app.py
app.py CHANGED
@@ -35,8 +35,8 @@ os.makedirs("scored", exist_ok=True)
 
 # # Display the results
 eval_results = load_dataset(RESULTS_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
-def get_dataframe_from_results(eval_results, split):
-    local_df = eval_results[split]
+def get_dataframe_from_results(eval_results, split, mode):
+    local_df = eval_results[f'{split}_{mode}']
     local_df = local_df.remove_columns(["Mail"])
     df = pd.DataFrame(local_df)
     df = df.sort_values(by=["Final Pass Rate"], ascending=False)
@@ -45,9 +45,10 @@ def get_dataframe_from_results(eval_results, split):
     return df
 
 
-
-
-
+eval_dataframe_val_twostage = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='twostage')
+eval_dataframe_val_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='soleplanning')
+eval_dataframe_test_twostage = get_dataframe_from_results(eval_results=eval_results, split="test",mode='twostage')
+eval_dataframe_test_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="test",mode='soleplanning')
 
 
 # def restart_space():
@@ -67,6 +68,7 @@ def add_new_eval(
     val_or_test: str,
     eval_mode: str,
     model: str,
+    tooluse_strategy: str,
     planning_strategy: str,
     organization: str,
     mail: str,
@@ -86,7 +88,7 @@ def add_new_eval(
     api.upload_file(
         repo_id=RESULTS_DATASET,
         path_or_fileobj=path_to_file.name,
-        path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
+        path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
         repo_type="dataset",
         token=TOKEN
     )
@@ -94,14 +96,14 @@ def add_new_eval(
     # Compute score
     file_path = path_to_file.name
     result = eval_score(val_or_test,file_path=file_path,TOKEN=TOKEN)
-    with open(f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl", "w") as scored_file:
+    with open(f"scored/{organization}_{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}.jsonl", "w") as scored_file:
         scored_file.write(json.dumps(result) + "\n")
 
     # Save scored file
     api.upload_file(
         repo_id=RESULTS_DATASET,
-        path_or_fileobj=f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl",
-        path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
+        path_or_fileobj=f"scored/{organization}_{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}.jsonl",
+        path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
         repo_type="dataset",
         token=TOKEN
     )
@@ -109,6 +111,7 @@ def add_new_eval(
     # Actual submission
     eval_entry = {
         "Model": model,
+        "Tool-use Strategy": tooluse_strategy,
         "Planning Strategy": planning_strategy,
         "Organization": organization,
         "Mail": mail,
@@ -119,21 +122,23 @@ def add_new_eval(
         "Hard Constraint Macro Pass Rate":result['Hard Constraint Macro Pass Rate'],
         "Final Pass Rate":result['Final Pass Rate']
     }
-
-    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+    eval_mode = eval_mode.replace('-','')
+    eval_results[f'{val_or_test}_{eval_mode}'] = eval_results[f'{val_or_test}_{eval_mode}'].add_item(eval_entry)
 
     print(eval_results)
 
     eval_results.push_to_hub(RESULTS_DATASET, config_name = 'scores', token=TOKEN)
 
-    return format_log(f"Model {model} submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
+    return format_log(f"Model: {model} | Tool-use Strategy: {tooluse_strategy} | Planning Strategy: {planning_strategy} | submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed (Validation ~2mins, Test ~7mins).")
 
 
 def refresh():
     eval_results = load_dataset(RESULTS_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
-
-
-
+    eval_dataframe_val_twostage = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='twostage')
+    eval_dataframe_val_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='soleplanning')
+    eval_dataframe_test_twostage = get_dataframe_from_results(eval_results=eval_results, split="test",mode='twostage')
+    eval_dataframe_test_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="test",mode='soleplanning')
+    return eval_dataframe_val_twostage, eval_dataframe_val_soleplanning, eval_dataframe_test_twostage, eval_dataframe_test_soleplanning
 
 # def upload_file(files):
 #     file_paths = [file.name for file in files]
@@ -145,13 +150,22 @@ with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
-    with gr.Tab("Results: Validation"):
-
-            value=
+    with gr.Tab("Results: Validation | Two-Stage "):
+        leaderboard_table_val_twostage = gr.components.Dataframe(
+            value=eval_dataframe_val_twostage, interactive=False,
+        )
+    with gr.Tab("Results: Validation | Sole-Planning"):
+        leaderboard_table_val_soleplanning = gr.components.Dataframe(
+            value=eval_dataframe_val_soleplanning, interactive=False,
+        )
+
+    with gr.Tab("Results: Test | Two-Stage "):
+        leaderboard_table_test_twostage = gr.components.Dataframe(
+            value=eval_dataframe_test_twostage, interactive=False,
        )
-    with gr.Tab("Results: Test"):
-
-            value=
+    with gr.Tab("Results: Test | Sole-Planning"):
+        leaderboard_table_test_soleplanning = gr.components.Dataframe(
+            value=eval_dataframe_test_soleplanning, interactive=False,
        )
 
     refresh_button = gr.Button("Refresh")
@@ -159,8 +173,10 @@ with demo:
         refresh,
         inputs=[],
         outputs=[
-
-
+            leaderboard_table_val_twostage,
+            leaderboard_table_val_soleplanning,
+            leaderboard_table_test_twostage,
+            leaderboard_table_test_soleplanning,
         ],
     )
     with gr.Accordion("Submit a new file for evaluation"):
@@ -169,6 +185,7 @@ with demo:
            level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
            eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
            model = gr.Textbox(label="Foundation Model")
+           tooluse_strategy = gr.Textbox(label="Tool-use Strategy")
            planning_strategy = gr.Textbox(label="Planning Strategy")
        with gr.Column():
            organization = gr.Textbox(label="Organization")
@@ -184,6 +201,7 @@ with demo:
                level_of_test,
                eval_mode,
                model,
+               tooluse_strategy,
                planning_strategy,
                organization,
                mail,
@@ -192,8 +210,6 @@ with demo:
            submission_result,
        )
 
-    # scheduler = BackgroundScheduler()
-    # scheduler.add_job(restart_space, "interval", seconds=3600)
-    # scheduler.start()
    demo.launch(debug=True)
 
+
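A note on the mechanism this commit introduces: the 'scores' dataset is now read and written through four splits named on the pattern f'{val_or_test}_{eval_mode}', with the hyphenated UI mode normalized via eval_mode.replace('-',''). The sketch below (a hypothetical split_key helper, not part of the commit) just restates that naming rule so the mapping from the Radio values to the four leaderboard tabs is explicit:

# Minimal sketch, not part of the commit: reproduce the split-naming scheme
# that the updated app.py uses when adding entries and building the tabs.
def split_key(val_or_test: str, eval_mode: str) -> str:
    # "two-stage" -> "twostage", "sole-planning" -> "soleplanning"
    return f"{val_or_test}_{eval_mode.replace('-', '')}"

# The four leaderboard tables read from exactly these splits:
assert split_key("validation", "two-stage") == "validation_twostage"
assert split_key("validation", "sole-planning") == "validation_soleplanning"
assert split_key("test", "two-stage") == "test_twostage"
assert split_key("test", "sole-planning") == "test_soleplanning"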