Spaces:

osunlp
/

TravelPlannerLeaderboard

Running

File size: 6,864 Bytes

import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard/evaluation")))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard")))
os.chdir(os.path.dirname(os.path.abspath(__file__)))
os.environ['CURL_CA_BUNDLE'] = ''
import json
import datetime
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np

from datasets import load_dataset
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
# from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
from eval import eval_score

# Hub access token taken from the environment; stays None when TOKEN is unset.
TOKEN = os.environ.get("TOKEN")

# Dataset repositories, all owned by the same Hub account.
OWNER = "osunlp"
DATA_DATASET = OWNER + "/TravelBench"
EVAL_DATASET = OWNER + "/TravelBenchEval"
RESULTS_DATASET = OWNER + "/TravelBenchPublicResults"

# Shared Hub client used for the file uploads performed on submission.
api = HfApi()

# NOTE(review): leftover note `'scores' = "2024"` -- presumably an older
# config name for the results dataset; confirm before deleting.

# Local scratch directory for scored files; created if it does not exist yet.
os.makedirs("scored", exist_ok=True)

# Fetch the published results (config 'scores') that back the leaderboard.
# NOTE(review): `ignore_verifications` is deprecated in newer `datasets`
# releases in favour of `verification_mode` -- confirm the pinned version.
eval_results = load_dataset(
    RESULTS_DATASET,
    'scores',
    token=TOKEN,
    download_mode="force_redownload",
    ignore_verifications=True,
)
def get_dataframe_from_results(eval_results, split):
    """Build the leaderboard table for one split of the results dataset.

    Args:
        eval_results: Mapping from split name to a dataset object that
            exposes ``remove_columns`` and ``column_names`` and iterates as
            row dicts (here, the object returned by ``load_dataset``).
        split: Name of the split to render, e.g. "validation" or "test".

    Returns:
        A pandas DataFrame without the "Mail" column, sorted by
        "Final Pass Rate" in descending order, where every column whose name
        contains "Rate" has been multiplied by 100 and rounded to two
        decimals. A split with no rows yields an empty DataFrame that still
        carries the expected column headers.
    """
    # Contact e-mails are stored with each entry but must not be displayed.
    public_rows = eval_results[split].remove_columns(["Mail"])
    column_names = list(public_rows.column_names)

    # Passing the columns explicitly keeps the header intact when the split
    # has no rows; otherwise the frame would have no columns at all and the
    # sort below would raise KeyError.
    df = pd.DataFrame(public_rows, columns=column_names)
    if df.empty:
        return df

    df = df.sort_values(by=["Final Pass Rate"], ascending=False)
    rate_cols = [name for name in column_names if "Rate" in name]
    # Stored rates are multiplied by 100 for display.
    df[rate_cols] = df[rate_cols].multiply(100).round(decimals=2)
    return df


# Leaderboard tables shown at start-up, one per split of the results dataset.
eval_dataframe_val, eval_dataframe_test = (
    get_dataframe_from_results(eval_results=eval_results, split=split_name)
    for split_name in ("validation", "test")
)



# def restart_space():
#     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


def load_line_json_data(filename):
    """Load a JSON Lines file into a list of decoded objects.

    Blank or whitespace-only lines are skipped, so an empty file yields an
    empty list and stray blank lines between records are tolerated.

    Args:
        filename: Path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterating the handle streams line by line instead of reading the
        # whole file and splitting it.
        return [json.loads(line) for line in f if line.strip()]


def add_new_eval(
    val_or_test: str,
    eval_mode: str,
    model: str,
    planning_strategy: str,
    organization: str,
    mail: str,
    path_to_file: str,
):
    """Score a submitted file and append the result to the leaderboard.

    The submitted file is uploaded to RESULTS_DATASET, scored with
    ``eval_score``, the scores are written to a local file under ``scored/``
    and uploaded as well, and a new row is added to the module-level
    ``eval_results`` for the chosen split and pushed to the hub.

    Args:
        val_or_test: Split the submission targets; used as the key into
            ``eval_results`` and passed to ``eval_score``.
        eval_mode: Evaluation mode label; only used in the file names.
        model: Model name shown in the "Model" column.
        planning_strategy: Value for the "Planning Strategy" column.
        organization: Value for the "Organization" column; also the top-level
            folder of the uploaded files.
        mail: Contact e-mail; must contain "@" after ``parseaddr``.
        path_to_file: Uploaded file object from the form (its ``.name`` is
            read as the local path), or None when nothing was attached.

    Returns:
        The message built by ``format_warning`` when the e-mail is invalid or
        the file is missing, otherwise the message built by ``format_log``.
    """
    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email adress.")

    print("Adding new eval")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    file_path = path_to_file.name
    run_tag = f"{val_or_test}_{eval_mode}_{planning_strategy}"

    # Save submitted file
    # NOTE(review): this path has no {model} folder while the scored upload
    # below does -- confirm whether that asymmetry is intended.
    api.upload_file(
        repo_id=RESULTS_DATASET,
        path_or_fileobj=file_path,
        path_in_repo=f"{organization}/{run_tag}_raw_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN,
    )

    # Compute score
    result = eval_score(val_or_test, file_path=file_path, TOKEN=TOKEN)

    # The form fields are free text. Anything that is not alphanumeric or one
    # of "-_." is replaced, so a "/" in an organization or strategy name
    # cannot make open() fail or write outside the scored/ directory. Only
    # the local scratch file name changes; the hub paths are untouched.
    safe_name = "".join(
        ch if ch.isalnum() or ch in "-_." else "_"
        for ch in f"{organization}_{run_tag}"
    )
    scored_path = os.path.join("scored", f"{safe_name}.jsonl")
    with open(scored_path, "w", encoding="utf-8") as scored_file:
        scored_file.write(json.dumps(result) + "\n")

    # Save scored file
    api.upload_file(
        repo_id=RESULTS_DATASET,
        path_or_fileobj=scored_path,
        path_in_repo=f"{organization}/{model}/{run_tag}_scored_{datetime.datetime.today()}.jsonl",
        repo_type="dataset",
        token=TOKEN,
    )

    # Actual submission
    metric_names = (
        "Delivery Rate",
        "Commonsense Constraint Micro Pass Rate",
        "Commonsense Constraint Macro Pass Rate",
        "Hard Constraint Micro Pass Rate",
        "Hard Constraint Macro Pass Rate",
        "Final Pass Rate",
    )
    eval_entry = {
        "Model": model,
        "Planning Strategy": planning_strategy,
        "Organization": organization,
        "Mail": mail,
    }
    # Copy each metric from the scorer output; a missing key raises KeyError.
    eval_entry.update({name: result[name] for name in metric_names})

    # Item assignment mutates the module-level results object in place.
    eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)

    print(eval_results)

    eval_results.push_to_hub(RESULTS_DATASET, config_name='scores', token=TOKEN)

    return format_log(f"Model {model} submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")


def refresh():
    """Re-download the results dataset and rebuild both leaderboard tables.

    Returns:
        A ``(validation_dataframe, test_dataframe)`` tuple.
    """
    latest = load_dataset(
        RESULTS_DATASET,
        'scores',
        token=TOKEN,
        download_mode="force_redownload",
        ignore_verifications=True,
    )
    # Only locals are rebuilt here; the module-level tables are not touched.
    return tuple(
        get_dataframe_from_results(eval_results=latest, split=split_name)
        for split_name in ("validation", "test")
    )

# def upload_file(files):
#     file_paths = [file.name for file in files]
#     return file_paths


# Assemble the Gradio page: title and introduction, one results table per
# split, a refresh button, and the submission form. Components appear in the
# order they are created inside the `with` blocks.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # Tables are seeded with the dataframes computed at import time.
    with gr.Tab("Results: Validation"):
        leaderboard_table_val = gr.components.Dataframe(
            value=eval_dataframe_val, interactive=False,
        )
    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test, interactive=False,
        )

    # refresh() returns (validation, test); the outputs list uses that order.
    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_val,
            leaderboard_table_test,
        ],
    )
    # Submission form: split/mode/model/strategy on the left, organization,
    # contact e-mail and the file upload on the right.
    with gr.Accordion("Submit a new file for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
                eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
                model = gr.Textbox(label="Foundation Model")
                planning_strategy = gr.Textbox(label="Planning Strategy")
            with gr.Column():
                organization = gr.Textbox(label="Organization")
                mail = gr.Textbox(label="Contact email")
                file_output = gr.File()


        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        # The inputs are passed positionally, so their order must match the
        # parameter order of add_new_eval; its return value is shown in
        # submission_result.
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                eval_mode,
                model,
                planning_strategy,
                organization,
                mail,
                file_output,
            ],
            submission_result,
        )

# Disabled periodic restart; it relies on restart_space, which is itself
# commented out earlier in this file.
# scheduler = BackgroundScheduler()
# scheduler.add_job(restart_space, "interval", seconds=3600)
# scheduler.start()
# Start the web server as soon as this module is executed.
demo.launch(debug=True)