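"""Gradio leaderboard app for the STEM benchmark.

Loads the results and contact-info datasets from the Hugging Face Hub,
renders the leaderboard table, and accepts new submissions, which are
scored against a private gold-label dataset and pushed back to the Hub.
"""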
import os
import json
import datetime
from email.utils import parseaddr

import gradio as gr
import pandas as pd
import numpy as np
from datasets import load_dataset, DatasetDict
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import HfApi

# InfoStrings
from scorer import question_scorer
from content import (
    format_error,
    format_warning,
    format_log,
    TITLE,
    INTRODUCTION_TEXT,
    model_hyperlink,
)
TOKEN = os.environ.get("TOKEN", None)

OWNER = "stemdataset"
INTERNAL_DATA_DATASET = f"{OWNER}/STEM-Labels-Private"
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
CONTACT_DATASET = f"{OWNER}/contact_info"
RESULTS_DATASET = f"{OWNER}/results"
LEADERBOARD_PATH = f"{OWNER}/stem-leaderboard"

api = HfApi()

os.makedirs("scored", exist_ok=True)
# Display the results
eval_results = load_dataset(
    RESULTS_DATASET,
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode="no_checks",
)
contact_infos = load_dataset(
    CONTACT_DATASET,
    token=TOKEN,
    download_mode="force_redownload",
    verification_mode="no_checks",
)
def get_dataframe_from_results(eval_results: DatasetDict, split):
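    """Turn one split of the results dataset into a display-ready DataFrame.

    Renders the model name as a hyperlink, renames columns to their display
    names, sorts by average score, and formats the score columns to one
    decimal place.
    """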
    local_df = eval_results[split]
    local_df = local_df.map(
        lambda row: {"model": model_hyperlink(row["url"], row["model"])}
    )
    local_df = local_df.remove_columns(["url"])
    local_df = local_df.rename_column("model", "Model Name")
    local_df = local_df.rename_column("model_family", "Model Family")
    local_df = local_df.rename_column("average", "Average")
    local_df = local_df.rename_column("science", "Science")
    local_df = local_df.rename_column("technology", "Technology")
    local_df = local_df.rename_column("engineering", "Engineering")
    local_df = local_df.rename_column("math", "Math")
    local_df = local_df.rename_column("organisation", "Organisation")
    local_df = local_df.rename_column("submit_date", "Submit Date")
    df = pd.DataFrame(local_df)
    df = df[
        [
            "Model Name",
            "Model Family",
            "Science",
            "Technology",
            "Engineering",
            "Math",
            "Average",
            "Organisation",
            "Submit Date",
        ]
    ]
    df = df.sort_values(by=["Average"], ascending=False)
    numeric_cols = ["Science", "Technology", "Engineering", "Math", "Average"]
    df[numeric_cols] = df[numeric_cols].round(decimals=1)
    for col in numeric_cols:
        df[col] = df[col].apply(lambda x: f"{x:.1f}")
    return df
eval_dataframe_test = get_dataframe_from_results(
    eval_results=eval_results, split="basic"
)

# Gold answers
gold_dataset = load_dataset(INTERNAL_DATA_DATASET, token=TOKEN)["labels"]
def restart_space():
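    """Restart the leaderboard Space (called hourly by the scheduler below)."""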
    api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
# One datatype per displayed column: Model Name, Model Family, the five
# score columns, then Organisation and Submit Date.
TYPES = ["markdown", "str", "number", "number", "number", "number", "number", "str", "str"]
def calc_test_acc(preds: list[int]) -> dict[str, float]:
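    """Score predictions against the gold labels, per subject and on average.

    Note: the gold labels use "engineer" as the subject key; it is shown as
    "Engineering" on the leaderboard. Scores are returned as percentages
    rounded to one decimal place.
    """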
    tmp_accs = {
        "science": [0, 0],
        "technology": [0, 0],
        "engineer": [0, 0],
        "math": [0, 0],
    }
    labels = gold_dataset
    for pred, label in zip(preds, labels):
        subject = label["subject"]
        tmp_accs[subject][1] += 1
        if pred == label["answer_idx"]:
            tmp_accs[subject][0] += 1
    accs = {k: v[0] / v[1] for k, v in tmp_accs.items()}
    accs["average"] = np.mean(list(accs.values()))
    accs = {k: round(v * 100, 1) for k, v in accs.items()}
    return accs
def add_new_eval(
    val_or_test: str,
    model: str,
    model_family: str,
    url: str,
    path_to_file: gr.File,
    organisation: str,
    mail: str,
):
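    """Validate a submission, score it, and push the results to the Hub.

    The raw and scored files are archived in the private submissions dataset;
    the contact email is stored separately in the private contact dataset.
    """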
    curr_timestamp = datetime.datetime.today()
    # Very basic email parsing
    _, parsed_mail = parseaddr(mail)
    if "@" not in parsed_mail:
        return format_warning("Please provide a valid email address.")
    if model == "":
        return format_warning("Please provide a model name.")
    if model_family == "":
        return format_warning("Please provide a model family.")
    print(
        json.dumps(
            {
                "val_or_test": val_or_test,
                "model": model,
                "model_family": model_family,
                "url": url,
                # The uploaded file object is not JSON-serializable; log its
                # string representation instead.
                "path_to_file": str(path_to_file),
                "organisation": organisation,
                "mail": mail,
            },
            indent=2,
        )
    )
    print("Adding new eval")
    # Warn if this model/organisation combination has already been submitted.
    submitted_pairs = {
        (m.lower(), o.lower())
        for m, o in zip(
            eval_results["basic"]["model"], eval_results["basic"]["organisation"]
        )
    }
    if (model.lower(), organisation.lower()) in submitted_pairs:
        return format_warning("This model has already been submitted.")
    if path_to_file is None:
        return format_warning("Please attach a file.")
    # Save submitted file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=path_to_file.name,
        path_in_repo=f"{organisation}/{model}/{val_or_test}_raw_{curr_timestamp}.txt",
        repo_type="dataset",
        token=TOKEN,
    )
    # Compute score: parse and validate all predictions before writing the
    # scored file, so a malformed line does not leave a stale file behind.
    file_path = path_to_file.name
    preds = []
    with open(file_path, "r") as f:
        for ix, line in enumerate(f):
            try:
                pred_idx = int(line.strip())
            except ValueError:
                return format_error(
                    f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file."
                )
            preds.append(pred_idx)
    stem_scores = calc_test_acc(preds)
    with open(f"scored/{organisation}_{model}.json", "w") as scored_file:
        scored_file.write(json.dumps(stem_scores, indent=2))
    # Save scored file
    api.upload_file(
        repo_id=SUBMISSION_DATASET,
        path_or_fileobj=f"scored/{organisation}_{model}.json",
        path_in_repo=f"{organisation}/{model}/{val_or_test}_scored_{curr_timestamp}.json",
        repo_type="dataset",
        token=TOKEN,
    )
    # Actual submission
    eval_entry = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "submit_date": "\n".join(str(curr_timestamp).split(" ")),
        "science": stem_scores["science"],
        "technology": stem_scores["technology"],
        "engineering": stem_scores["engineer"],
        "math": stem_scores["math"],
        "average": stem_scores["average"],
    }
    eval_results["basic"] = eval_results["basic"].add_item(eval_entry)
    print(eval_results)
    eval_results.push_to_hub(RESULTS_DATASET, token=TOKEN)

    contact_info = {
        "model": model,
        "model_family": model_family,
        "url": url,
        "organisation": organisation,
        "mail": mail,
        "submit_date": "\n".join(str(curr_timestamp).split(" ")),
    }
    contact_infos["basic"] = contact_infos["basic"].add_item(contact_info)
    contact_infos.push_to_hub(CONTACT_DATASET, token=TOKEN)
    return format_log(
        f"Model {model} submitted by {organisation} successfully.\nPlease refresh the leaderboard and wait a bit for the score to be displayed."
    )
def refresh():
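    """Re-download the results dataset and rebuild the leaderboard table."""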
    eval_results = load_dataset(
        RESULTS_DATASET,
        token=TOKEN,
        download_mode="force_redownload",
        verification_mode="no_checks",
    )
    eval_dataframe_test = get_dataframe_from_results(
        eval_results=eval_results, split="basic"
    )
    return eval_dataframe_test
def upload_file(files):
    # Helper for file uploads; not wired to any component below.
    file_paths = [file.name for file in files]
    return file_paths
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tab("Results: Test"):
        leaderboard_table_test = gr.components.Dataframe(
            value=eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=True,
        )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            leaderboard_table_test,
        ],
    )
    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            with gr.Column():
                level_of_test = gr.Radio(["test"], value="test", label="Split")
                model_name_textbox = gr.Textbox(label="Model name")
                model_family_textbox = gr.Textbox(label="Model family")
                url_textbox = gr.Textbox(label="URL to model information")
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(
                    label="Contact email (stored privately, and used if there is an issue with your submission)"
                )
                file_output = gr.File()

    submit_button = gr.Button("Submit Eval")
    submission_result = gr.Markdown()
    submit_button.click(
        add_new_eval,
        [
            level_of_test,
            model_name_textbox,
            model_family_textbox,
            url_textbox,
            file_output,
            organisation,
            mail,
        ],
        submission_result,
    )
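# Restart the Space every hour so freshly pushed results are reloaded
# (the datasets are downloaded at module import time).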
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()

demo.launch(debug=True, server_name="0.0.0.0")