# NOTE(review): the six lines below are Hugging Face file-viewer residue
# (author, commit message, commit hash, size) accidentally captured with the
# file; they are not Python and would break execution if left unquoted.
# hsaest's picture
# Update app.py
# df45613 verified
# raw
# history blame
# 8.51 kB
import os
import sys
# Make the bundled leaderboard evaluation code importable; provides the
# `eval` and `content` modules imported below.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard/evaluation")))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard")))
# Run relative to this file so relative paths (e.g. "scored/") resolve the
# same way regardless of the working directory at launch.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# NOTE(review): an empty CURL_CA_BUNDLE disables TLS certificate verification
# for `requests` — presumably a proxy/cert workaround in the Space; confirm
# this is still required, as it is a security trade-off.
os.environ['CURL_CA_BUNDLE'] = ''
import json
import datetime
from email.utils import parseaddr
import gradio as gr
import pandas as pd
import numpy as np
from datasets import load_dataset
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi
# InfoStrings
# from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
from eval import eval_score
# Hub token used both to read the private eval data and to push results.
TOKEN = os.environ.get("TOKEN", None)
OWNER="osunlp"
DATA_DATASET = f"{OWNER}/TravelPlanner"  # benchmark data repo
EVAL_DATASET = f"{OWNER}/TravelPlannerEval"  # evaluation repo
RESULTS_DATASET = f"{OWNER}/TravelPlannerPublicResults"  # public leaderboard scores
api = HfApi()
# 'scores' = "2024"
# Local scratch directory where scored submissions are written before upload.
os.makedirs("scored", exist_ok=True)
# # Display the results
# Force a fresh download so the tables reflect the latest pushed scores.
# NOTE(review): `ignore_verifications` is deprecated in newer `datasets`
# releases (replaced by `verification_mode`) — confirm the pinned version.
eval_results = load_dataset(RESULTS_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
def get_dataframe_from_results(eval_results, split, mode):
    """Build the leaderboard table for one (split, mode) configuration.

    Drops the private "Mail" column, sorts rows by "Final Pass Rate" in
    descending order, and rescales every "*Rate" column from a fraction to
    a 0-100 percentage rounded to two decimals.
    """
    split_ds = eval_results[f'{split}_{mode}'].remove_columns(["Mail"])
    table = pd.DataFrame(split_ds).sort_values(by=["Final Pass Rate"], ascending=False)
    rate_columns = [name for name in split_ds.column_names if "Rate" in name]
    table[rate_columns] = table[rate_columns].multiply(100).round(decimals=2)
    return table
# Pre-build the four leaderboard tables (split x eval mode) shown in the
# Gradio tabs below; `refresh()` rebuilds the same four on demand.
eval_dataframe_val_twostage = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='twostage')
eval_dataframe_val_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="validation", mode='soleplanning')
eval_dataframe_test_twostage = get_dataframe_from_results(eval_results=eval_results, split="test",mode='twostage')
eval_dataframe_test_soleplanning = get_dataframe_from_results(eval_results=eval_results, split="test",mode='soleplanning')
# def restart_space():
#     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
def load_line_json_data(filename):
    """Load a JSON-Lines file into a list of parsed objects.

    Blank lines (including a completely empty file) are skipped instead of
    crashing: the original `f.read().strip().split('\n')` produced `['']`
    for an empty file and fed blank interior lines to `json.loads`, which
    raises `json.JSONDecodeError`.

    :param filename: path to a ``.jsonl`` file, one JSON object per line.
    :return: list of decoded objects, in file order.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate lazily instead of reading the whole file into memory.
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines / trailing newline
            data.append(json.loads(line))
    return data
def add_new_eval(
    val_or_test: str,
    eval_mode: str,
    model: str,
    tooluse_strategy: str,
    planning_strategy: str,
    organization: str,
    mail: str,
    path_to_file: str,
):
    """Validate, score, and publish one leaderboard submission.

    Steps: (1) sanity-check the email and attachment, (2) upload the raw
    submission for auditability, (3) score it with ``eval_score``, (4) write
    and upload the scored file, (5) append the score row to the in-memory
    results dataset and push it back to the Hub.

    Returns a formatted log or warning string shown in the Gradio status
    Markdown component.
    """
    # Very basic email parsing -- only require an "@" in the parsed address.
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        # BUGFIX: corrected user-facing typo "adress" -> "address".
        return format_warning("Please provide a valid email address.")
    print("Adding new eval")
    if path_to_file is None:
        return format_warning("Please attach a file.")
    # Save the raw submitted file so every submission is auditable.
    api.upload_file(
        repo_id=RESULTS_DATASET,
        path_or_fileobj=path_to_file.name,
        path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN
    )
    # Compute the scores for this submission.
    file_path = path_to_file.name
    result = eval_score(val_or_test, file_path=file_path, TOKEN=TOKEN)
    # Single source of truth for the local scored-file path (was duplicated).
    scored_path = f"scored/{organization}_{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}.jsonl"
    with open(scored_path, "w") as scored_file:
        scored_file.write(json.dumps(result) + "\n")
    # Upload the scored file alongside the raw one.
    api.upload_file(
        repo_id=RESULTS_DATASET,
        path_or_fileobj=scored_path,
        path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{tooluse_strategy}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN
    )
    # Actual submission row appended to the public results dataset.
    eval_entry = {
        "Model": model,
        "Tool-use Strategy": tooluse_strategy,
        "Planning Strategy": planning_strategy,
        "Organization": organization,
        "Mail": mail,
        "Delivery Rate": result['Delivery Rate'],
        "Commonsense Constraint Micro Pass Rate": result['Commonsense Constraint Micro Pass Rate'],
        "Commonsense Constraint Macro Pass Rate": result['Commonsense Constraint Macro Pass Rate'],
        "Hard Constraint Micro Pass Rate": result['Hard Constraint Micro Pass Rate'],
        "Hard Constraint Macro Pass Rate": result['Hard Constraint Macro Pass Rate'],
        "Final Pass Rate": result['Final Pass Rate']
    }
    # Dataset split names carry no hyphen: "two-stage" -> "twostage", etc.
    eval_mode = eval_mode.replace('-', '')
    eval_results[f'{val_or_test}_{eval_mode}'] = eval_results[f'{val_or_test}_{eval_mode}'].add_item(eval_entry)
    print(eval_results)
    eval_results.push_to_hub(RESULTS_DATASET, config_name='scores', token=TOKEN)
    return format_log(f"Model: {model} | Tool-use Strategy: {tooluse_strategy} | Planning Strategy: {planning_strategy} | submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed (Validation ~2mins, Test ~7mins).")
def refresh():
    """Re-download the public results and rebuild all four leaderboard tables.

    Returns the tables in the order the Gradio outputs expect:
    (validation/twostage, validation/soleplanning, test/twostage,
    test/soleplanning).
    """
    latest_results = load_dataset(
        RESULTS_DATASET, 'scores', token=TOKEN,
        download_mode="force_redownload", ignore_verifications=True,
    )
    tables = [
        get_dataframe_from_results(eval_results=latest_results, split=split, mode=mode)
        for split in ("validation", "test")
        for mode in ("twostage", "soleplanning")
    ]
    return tuple(tables)
# def upload_file(files):
#     file_paths = [file.name for file in files]
#     return file_paths
# ---------------------------------------------------------------------------
# Gradio UI: four leaderboard tabs, a refresh button, and a submission form.
# ---------------------------------------------------------------------------
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    # One read-only table per (split, eval-mode) combination.
    with gr.Tab("Results: Validation | Two-Stage "):
        leaderboard_table_val_twostage = gr.components.Dataframe(
            value=eval_dataframe_val_twostage, interactive=False,
        )
    with gr.Tab("Results: Validation | Sole-Planning"):
        leaderboard_table_val_soleplanning = gr.components.Dataframe(
            value=eval_dataframe_val_soleplanning, interactive=False,
        )
    with gr.Tab("Results: Test | Two-Stage "):
        leaderboard_table_test_twostage = gr.components.Dataframe(
            value=eval_dataframe_test_twostage, interactive=False,
        )
    with gr.Tab("Results: Test | Sole-Planning"):
        leaderboard_table_test_soleplanning = gr.components.Dataframe(
            value=eval_dataframe_test_soleplanning, interactive=False,
        )
    # `refresh` returns the four tables in this same output order.
    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_val_twostage,
            leaderboard_table_val_soleplanning,
            leaderboard_table_test_twostage,
            leaderboard_table_test_soleplanning,
        ],
    )
    # Submission form: inputs map positionally onto `add_new_eval`'s params.
    with gr.Accordion("Submit a new file for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
                model = gr.Textbox(label="Foundation Model")
                tooluse_strategy = gr.Textbox(label="Tool-use Strategy")
                planning_strategy = gr.Textbox(label="Planning Strategy")
            with gr.Column():
                organization = gr.Textbox(label="Organization")
                mail = gr.Textbox(label="Contact email")
                file_output = gr.File()
        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                eval_mode,
                model,
                tooluse_strategy,
                planning_strategy,
                organization,
                mail,
                file_output,
            ],
            submission_result,
        )
# debug=True surfaces tracebacks in the UI; fine for a Space, noisy for prod.
demo.launch(debug=True)