Spaces:

allenai
/

reward-bench

Running

App Files Files Community

reward-bench / app.py

natolambert

add token back

e4cd4cd almost 2 years ago

raw

history blame

4.21 kB

	import gradio as gr
	import os
	from huggingface_hub import HfApi, snapshot_download
	from src.utils import load_all_data
	from src.md import ABOUT_TEXT
	import numpy as np

	# Hub client; only referenced by the commented-out restart helper further down.
	api = HfApi()

	# Access token read from the environment (None when the variable is unset).
	COLLAB_TOKEN = os.getenv("COLLAB_TOKEN")

	# Hub dataset repos that hold the results.
	evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
	prefs_repo = "ai2-rlhf-collab/rm-testset-results"

	# Local directories each repo is downloaded into.
	repo_dir_herm = "./evals/herm/"
	repo_dir_prefs = "./evals/prefs/"

	# def restart_space():
	# api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)



	print("Pulling evaluation results")

	# Download the benchmark results dataset into repo_dir_herm.
	# `token=` replaces the deprecated `use_auth_token=` keyword.
	# snapshot_download returns the local folder path, so there is nothing to
	# "git pull" afterwards.
	repo = snapshot_download(
	    repo_id=evals_repo,
	    repo_type="dataset",
	    local_dir=repo_dir_herm,
	    token=COLLAB_TOKEN,
	    tqdm_class=None,
	    etag_timeout=30,
	)

	# Same download for the preference-set results, into repo_dir_prefs.
	repo_pref_sets = snapshot_download(
	    repo_id=prefs_repo,
	    repo_type="dataset",
	    local_dir=repo_dir_prefs,
	    token=COLLAB_TOKEN,
	    tqdm_class=None,
	    etag_timeout=30,
	)

	def avg_over_herm(dataframe):
	    """
	    Average the score columns of each HERM subset into one column per subset.

	    For every subset name (alpacaeval, mt-bench, llmbar, refusals, hep) the
	    columns whose name contains that subset name are NaN-aware averaged row by
	    row and rounded to 2 decimals. The "average" column is then recomputed as
	    the rounded NaN-aware mean of the five subset columns.

	    Args:
	        dataframe: pandas DataFrame with "model" and "average" columns plus
	            the per-prompt-set score columns. It is not modified.

	    Returns:
	        A new DataFrame with exactly the columns "model", "average" and the
	        five subset names, in that order.

	    Raises:
	        KeyError: if "model" or "average" is missing from the input.
	    """
	    subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]

	    # Work on a copy so the aggregate columns do not leak into the caller's
	    # dataframe (which is also displayed on its own).
	    new_df = dataframe.copy()

	    # for each subset, avg the columns that have the subset in the column name,
	    # then add a new column with subset name and avg
	    for subset in subsets:
	        subset_cols = [col for col in new_df.columns if subset in col]
	        new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)

	    keep_columns = ["model", "average"] + subsets
	    # .copy() makes the column selection an independent frame, so the
	    # assignment below does not raise SettingWithCopyWarning.
	    new_df = new_df[keep_columns].copy()

	    # replace average column with new average
	    new_df["average"] = np.round(np.nanmean(new_df[subsets].values, axis=1), 2)
	    return new_df

	def expand_subsets(dataframe):
	    """Placeholder for expanding results per subset; not implemented, returns None."""
	    # TODO need to modify data/ script to do this
	    return None

	# Load each result set and order it by the "average" column, highest first.
	herm_data = load_all_data(repo_dir_herm)
	herm_data = herm_data.sort_values(by="average", ascending=False)

	herm_data_avg = avg_over_herm(herm_data)
	herm_data_avg = herm_data_avg.sort_values(by="average", ascending=False)

	prefs_data = load_all_data(repo_dir_prefs)
	prefs_data = prefs_data.sort_values(by="average", ascending=False)
	# TODO: build prefs_data_sub with expand_subsets(prefs_data) once it is implemented.

	# Column datatypes for the tables: the first column is "markdown",
	# every remaining column is "number".
	col_types_herm = ["markdown", *(["number"] * (len(herm_data.columns) - 1))]
	col_types_herm_avg = ["markdown", *(["number"] * (len(herm_data_avg.columns) - 1))]
	col_types_prefs = ["markdown", *(["number"] * (len(prefs_data.columns) - 1))]
	# TODO: add col_types_prefs_sub alongside prefs_data_sub.

	with gr.Blocks() as app:
	    # Layout: a title row, then one tab per results table plus an "About" tab.
	    with gr.Row():
	        gr.Markdown("# HERM Results Viewer")

	    with gr.Tabs(elem_classes="tab-buttons") as tabs:
	        # Per-subset averages.
	        with gr.TabItem("HERM - Overview"), gr.Row():
	            herm_table = gr.Dataframe(
	                value=herm_data_avg.values,
	                headers=herm_data_avg.columns.tolist(),
	                datatype=col_types_herm_avg,
	                elem_id="herm_dataframe_avg",
	            )

	        # Full HERM results. Note this rebinds `herm_table`, as the
	        # original code does.
	        with gr.TabItem("HERM - Detailed"), gr.Row():
	            herm_table = gr.Dataframe(
	                value=herm_data.values,
	                headers=herm_data.columns.tolist(),
	                datatype=col_types_herm,
	                elem_id="herm_dataframe",
	            )

	        # Preference-set results (placed directly in the tab, without a Row).
	        with gr.TabItem("Pref Sets - Overview"):
	            pref_sets_table = gr.Dataframe(
	                value=prefs_data.values,
	                headers=prefs_data.columns.tolist(),
	                datatype=col_types_prefs,
	                elem_id="prefs_dataframe",
	            )

	        with gr.TabItem("About"), gr.Row():
	            gr.Markdown(ABOUT_TEXT)

	# Load data when app starts
	def load_data_on_start():
	    """
	    Reload the result tables from the local result directories.

	    Reads the HERM results from repo_dir_herm and the preference-set results
	    from repo_dir_prefs, derives the per-subset averages, and calls `update`
	    on the table components with the fresh data.

	    NOTE(review): nothing visible in this file calls or registers this
	    function, so it appears to be unused — confirm whether it should be
	    hooked up to the app.
	    NOTE(review): both HERM tables are assigned to the name `herm_table`
	    (the later assignment wins), so both HERM update calls below target the
	    same component — confirm which table each call is meant for.
	    """
	    data_herm = load_all_data(repo_dir_herm)
	    herm_table.update(data_herm)

	    # avg_over_herm works on the loaded dataframe, not on the directory path
	    # (passing the path string would fail on `.columns`).
	    data_herm_avg = avg_over_herm(data_herm)
	    herm_table.update(data_herm_avg)

	    data_prefs = load_all_data(repo_dir_prefs)
	    pref_sets_table.update(data_prefs)

	# Launch the Gradio Blocks app built above.
	app.launch()