import os
import json
import datetime
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np
from datasets import load_dataset, DatasetDict
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
from scorer import question_scorer
from content import (
    format_error,
    format_warning,
    format_log,
    TITLE,
    INTRODUCTION_TEXT,
    model_hyperlink,
)

TOKEN = os.environ.get("TOKEN", None)

OWNER = "stemdataset"
INTERNAL_DATA_DATASET = f"{OWNER}/STEM-Labels-Private"
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
CONTACT_DATASET = f"{OWNER}/contact_info"
RESULTS_DATASET = f"{OWNER}/results"
LEADERBOARD_PATH = f"{OWNER}/stem-leaderboard"
api = HfApi()

os.makedirs("scored", exist_ok=True)

# Load the current results and contact info from the Hub
eval_results = load_dataset(
    RESULTS_DATASET,
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode="no_checks",
)
contact_infos = load_dataset(
    CONTACT_DATASET,
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode="no_checks",
)


def get_dataframe_from_results(eval_results: DatasetDict, split):
    """Turn a results split into the leaderboard dataframe, sorted by average score."""
    local_df = eval_results[split]
    local_df = local_df.map(
        lambda row: {"model": model_hyperlink(row["url"], row["model"])}
    )
    local_df = local_df.remove_columns(["url"])
    local_df = local_df.rename_column("model", "Model Name")
    local_df = local_df.rename_column("model_family", "Model Family")
    local_df = local_df.rename_column("average", "Average")
    local_df = local_df.rename_column("science", "Science")
    local_df = local_df.rename_column("technology", "Technology")
    local_df = local_df.rename_column("engineering", "Engineering")
    local_df = local_df.rename_column("math", "Math")
    local_df = local_df.rename_column("organisation", "Organisation")
    local_df = local_df.rename_column("submit_date", "Submit Date")
    df = pd.DataFrame(local_df)
    df = df[
        [
            "Model Name",
            "Model Family",
            "Science",
            "Technology",
            "Engineering",
            "Math",
            "Average",
            "Organisation",
            "Submit Date",
        ]
    ]
    df = df.sort_values(by=["Average"], ascending=False)
    numeric_cols = ["Science", "Technology", "Engineering", "Math", "Average"]
    df[numeric_cols] = df[numeric_cols].round(decimals=1)
    for col in numeric_cols:
        df[col] = df[col].apply(lambda x: f"{x:.1f}")
    return df


eval_dataframe_test = get_dataframe_from_results(
    eval_results=eval_results, split="basic"
)

# Gold answers
gold_dataset = load_dataset(INTERNAL_DATA_DATASET, token=TOKEN)["labels"]


def restart_space():
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)


TYPES = ["markdown", "number", "number", "number", "number", "str", "str"]


def calc_test_acc(preds: list[int]) -> dict[str, float]:
    """Compute per-subject and average accuracy (in %) against the gold labels."""
    tmp_accs = {
        "science": [0, 0],
        "technology": [0, 0],
        "engineer": [0, 0],
        "math": [0, 0],
    }
    labels = gold_dataset
    for pred, label in zip(preds, labels):
        subject = label["subject"]
        tmp_accs[subject][1] += 1
        if pred == label["answer_idx"]:
            tmp_accs[subject][0] += 1
    accs = {k: v[0] / v[1] for k, v in tmp_accs.items()}
    accs["average"] = np.mean(list(accs.values()))
    accs = {k: round(v * 100, 1) for k, v in accs.items()}
    return accs


def add_new_eval(
    val_or_test: str,
    model: str,
    model_family: str,
    url: str,
    path_to_file: gr.File,
    organisation: str,
    mail: str,
):
    curr_timestamp = datetime.datetime.today()

    # Very basic email validation
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")

    if model == "":
        return format_warning("Please provide a model name.")
    if model_family == "":
        return format_warning("Please provide a model family.")
    print(
        json.dumps(
            {
                "val_or_test": val_or_test,
                "model": model,
                "model_family": model_family,
                "url": url,
                "path_to_file": path_to_file,
                "organisation": organisation,
                "mail": mail,
            },
            indent=2,
        )
    )
    print("Adding new eval")

    # Warn if this model/organisation combination has already been submitted
    if model.lower() in set(
        [m.lower() for m in eval_results["basic"]["model"]]
    ) and organisation.lower() in set(
        [o.lower() for o in eval_results["basic"]["organisation"]]
    ):
        return format_warning("This model has already been submitted.")

    if path_to_file is None:
        return format_warning("Please attach a file.")

    # Save the submitted file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=path_to_file.name,
        path_in_repo=f"{organisation}/{model}/{val_or_test}_raw_{curr_timestamp}.txt",
        repo_type="dataset",
        token=TOKEN,
    )

    # Compute the score: each line of the submission holds a single integer,
    # the predicted answer index for the corresponding question
    file_path = path_to_file.name
    with open(f"scored/{organisation}_{model}.json", "w") as scored_file:
        with open(file_path, "r") as f:
            preds = []
            for ix, line in enumerate(f):
                try:
                    pred_idx = int(line.strip())
                except Exception:
                    return format_error(
                        f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file."
                    )
                preds.append(pred_idx)
            stem_scores = calc_test_acc(preds)
            scored_file.write(json.dumps(stem_scores, indent=2))

    # Save the scored file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=f"scored/{organisation}_{model}.json",
        path_in_repo=f"{organisation}/{model}/{val_or_test}_scored_{curr_timestamp}.json",
        repo_type="dataset",
        token=TOKEN,
    )

    # Actual submission: append the new entry to the results dataset and push it to the Hub
    eval_entry = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "submit_date": "\n".join(str(curr_timestamp).split(" ")),
        "science": stem_scores["science"],
        "technology": stem_scores["technology"],
        "engineering": stem_scores["engineer"],
        "math": stem_scores["math"],
        "average": stem_scores["average"],
    }
    eval_results["basic"] = eval_results["basic"].add_item(eval_entry)
    print(eval_results)
    eval_results.push_to_hub(RESULTS_DATASET, token=TOKEN)

    contact_info = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "mail": mail,
        "submit_date": "\n".join(str(curr_timestamp).split(" ")),
    }
    contact_infos["basic"] = contact_infos["basic"].add_item(contact_info)
    contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN)

    return format_log(
        f"Model {model} submitted by {organisation} successfully.\n"
        "Please refresh the leaderboard, and wait a bit to see the score displayed."
    )


def refresh():
    eval_results = load_dataset(
        RESULTS_DATASET,
        token=TOKEN,
        download_mode="force_redownload",
        verification_mode="no_checks",
    )
    eval_dataframe_test = get_dataframe_from_results(
        eval_results=eval_results, split="basic"
    )
    return eval_dataframe_test


def upload_file(files):
    file_paths = [file.name for file in files]
    return file_paths


demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=True,
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_test,
        ],
    )

    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["test"], value="test", label="Split")
                model_name_textbox = gr.Textbox(label="Model name")
                model_family_textbox = gr.Textbox(label="Model family")
                url_textbox = gr.Textbox(label="URL to model information")
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(
                    label="Contact email (will be stored privately, & used if there is an issue with your submission)"
                )
                file_output = gr.File()

        submit_button = gr.Button("Submit Eval")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                level_of_test,
                model_name_textbox,
                model_family_textbox,
                url_textbox,
                file_output,
                organisation,
                mail,
            ],
            submission_result,
        )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.launch(debug=True, server_name="0.0.0.0")