Update app.py
app.py CHANGED
@@ -35,8 +35,8 @@ os.makedirs("scored", exist_ok=True)
 
 # # Display the results
 eval_results = load_dataset(RESULTS_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
-def get_dataframe_from_results(eval_results, split):
-    local_df = eval_results[split]
+def get_dataframe_from_results(eval_results, split, mode):
+    local_df = eval_results[f'{split}_{mode}']
     local_df = local_df.remove_columns(["Mail"])
     df = pd.DataFrame(local_df)
     df = df.sort_values(by=["Final Pass Rate"], ascending=False)
@@ -45,9 +45,10 @@ def get_dataframe_from_results(eval_results, split):
     return df
 
 
-
-
-
+eval_dataframe_val_twostage = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='twostage')
+eval_dataframe_val_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='soleplanning')
+eval_dataframe_test_twostage = get_dataframe_from_results(eval_results=eval_results, split="test",mode='twostage')
+eval_dataframe_test_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="test",mode='soleplanning')
 
 
 # def restart_space():
@@ -67,6 +68,7 @@ def add_new_eval(
     val_or_test: str,
     eval_mode: str,
     model: str,
+    tooluse_strategy: str,
     planning_strategy: str,
     organization: str,
     mail: str,
@@ -86,7 +88,7 @@ def add_new_eval(
     api.upload_file(
         repo_id=RESULTS_DATASET,
         path_or_fileobj=path_to_file.name,
-        path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
+        path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
         repo_type="dataset",
         token=TOKEN
     )
@@ -94,14 +96,14 @@ def add_new_eval(
     # Compute score
     file_path = path_to_file.name
     result = eval_score(val_or_test,file_path=file_path,TOKEN=TOKEN)
-    with open(f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl", "w") as scored_file:
+    with open(f"scored/{organization}_{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}.jsonl", "w") as scored_file:
         scored_file.write(json.dumps(result) + "\n")
 
     # Save scored file
     api.upload_file(
         repo_id=RESULTS_DATASET,
-        path_or_fileobj=f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl",
-        path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
+        path_or_fileobj=f"scored/{organization}_{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}.jsonl",
+        path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
         repo_type="dataset",
         token=TOKEN
     )
@@ -109,6 +111,7 @@ def add_new_eval(
     # Actual submission
     eval_entry = {
         "Model": model,
+        "Tool-use Strategy": tooluse_strategy,
         "Planning Strategy": planning_strategy,
         "Organization": organization,
         "Mail": mail,
@@ -119,21 +122,23 @@ def add_new_eval(
         "Hard Constraint Macro Pass Rate":result['Hard Constraint Macro Pass Rate'],
         "Final Pass Rate":result['Final Pass Rate']
     }
-
-    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
+    eval_mode = eval_mode.replace('-','')
+    eval_results[f'{val_or_test}_{eval_mode}'] = eval_results[f'{val_or_test}_{eval_mode}'].add_item(eval_entry)
 
     print(eval_results)
 
     eval_results.push_to_hub(RESULTS_DATASET, config_name = 'scores', token=TOKEN)
 
-    return format_log(f"Model {model} submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
+    return format_log(f"Model: {model} | Tool-use Strategy: {tooluse_strategy} | Planning Strategy: {planning_strategy} | submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed (Validation ~2mins, Test ~7mins).")
 
 
 def refresh():
     eval_results = load_dataset(RESULTS_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
-
-
-
+    eval_dataframe_val_twostage = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='twostage')
+    eval_dataframe_val_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='soleplanning')
+    eval_dataframe_test_twostage = get_dataframe_from_results(eval_results=eval_results, split="test",mode='twostage')
+    eval_dataframe_test_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="test",mode='soleplanning')
+    return eval_dataframe_val_twostage, eval_dataframe_val_soleplanning, eval_dataframe_test_twostage, eval_dataframe_test_soleplanning
 
 # def upload_file(files):
 #     file_paths = [file.name for file in files]
@@ -145,13 +150,22 @@ with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
-    with gr.Tab("Results: Validation"):
-
-            value=
+    with gr.Tab("Results: Validation | Two-Stage "):
+        leaderboard_table_val_twostage = gr.components.Dataframe(
+            value=eval_dataframe_val_twostage, interactive=False,
+        )
+    with gr.Tab("Results: Validation | Sole-Planning"):
+        leaderboard_table_val_soleplanning = gr.components.Dataframe(
+            value=eval_dataframe_val_soleplanning, interactive=False,
+        )
+
+    with gr.Tab("Results: Test | Two-Stage "):
+        leaderboard_table_test_twostage = gr.components.Dataframe(
+            value=eval_dataframe_test_twostage, interactive=False,
        )
-    with gr.Tab("Results: Test"):
-
-            value=
+    with gr.Tab("Results: Test | Sole-Planning"):
+        leaderboard_table_test_soleplanning = gr.components.Dataframe(
+            value=eval_dataframe_test_soleplanning, interactive=False,
        )
 
     refresh_button = gr.Button("Refresh")
@@ -159,8 +173,10 @@ with demo:
         refresh,
         inputs=[],
         outputs=[
-
-
+            leaderboard_table_val_twostage,
+            leaderboard_table_val_soleplanning,
+            leaderboard_table_test_twostage,
+            leaderboard_table_test_soleplanning,
         ],
     )
     with gr.Accordion("Submit a new file for evaluation"):
@@ -169,6 +185,7 @@ with demo:
            level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
            eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
            model = gr.Textbox(label="Foundation Model")
+           tooluse_strategy = gr.Textbox(label="Tool-use Strategy")
            planning_strategy = gr.Textbox(label="Planning Strategy")
        with gr.Column():
            organization = gr.Textbox(label="Organization")
@@ -184,6 +201,7 @@ with demo:
                level_of_test,
                eval_mode,
                model,
+               tooluse_strategy,
                planning_strategy,
                organization,
                mail,
@@ -192,8 +210,6 @@ with demo:
            submission_result,
        )
 
-    # scheduler = BackgroundScheduler()
-    # scheduler.add_job(restart_space, "interval", seconds=3600)
-    # scheduler.start()
    demo.launch(debug=True)
 
+
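A note on the mechanism this commit introduces: the 'scores' dataset is now read and written through four splits named on the pattern f'{val_or_test}_{eval_mode}', with the hyphenated UI mode normalized via eval_mode.replace('-',''). The sketch below (a hypothetical split_key helper, not part of the commit) just restates that naming rule so the mapping from the Radio values to the four leaderboard tabs is explicit:

# Minimal sketch, not part of the commit: reproduce the split-naming scheme
# that the updated app.py uses when adding entries and building the tabs.
def split_key(val_or_test: str, eval_mode: str) -> str:
    # "two-stage" -> "twostage", "sole-planning" -> "soleplanning"
    return f"{val_or_test}_{eval_mode.replace('-', '')}"

# The four leaderboard tables read from exactly these splits:
assert split_key("validation", "two-stage") == "validation_twostage"
assert split_key("validation", "sole-planning") == "validation_soleplanning"
assert split_key("test", "two-stage") == "test_twostage"
assert split_key("test", "sole-planning") == "test_soleplanning"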