Spaces:
Running
Running
| import gradio as gr | |
| import os | |
| from huggingface_hub import HfApi, snapshot_download | |
| from src.utils import load_all_data | |
| from src.md import ABOUT_TEXT | |
| import numpy as np | |
# Hub client and the access token read from the environment (None when unset).
api = HfApi()
COLLAB_TOKEN = os.getenv("COLLAB_TOKEN")

# Dataset repositories holding the results, and the local folders they are
# mirrored into.
evals_repo = "ai2-rlhf-collab/rm-benchmark-results"
prefs_repo = "ai2-rlhf-collab/rm-testset-results"
repo_dir_herm = "./evals/herm/"
repo_dir_prefs = "./evals/prefs/"

# Disabled helper kept for reference:
# def restart_space():
#     api.restart_space(repo_id="ai2-rlhf-collab/rm-benchmark-viewer", token=COLLAB_TOKEN)

# Keyword arguments shared by both snapshot downloads below.
# NOTE(review): newer huggingface_hub releases document `token=` as the
# replacement for `use_auth_token=` -- confirm against the pinned version.
_snapshot_options = {
    "use_auth_token": COLLAB_TOKEN,
    "tqdm_class": None,
    "etag_timeout": 30,
    "repo_type": "dataset",
}

print("Pulling evaluation results")

# Mirror the benchmark results dataset into repo_dir_herm.
repo = snapshot_download(
    local_dir=repo_dir_herm,
    repo_id=evals_repo,
    **_snapshot_options,
)
# repo.git_pull()

# Mirror the preference test-set results dataset into repo_dir_prefs.
repo_pref_sets = snapshot_download(
    local_dir=repo_dir_prefs,
    repo_id=prefs_repo,
    **_snapshot_options,
)
# repo_pref_sets.git_pull()
def avg_over_herm(dataframe):
    """
    Average the per-subset columns into one column per subset.

    For each of alpacaeval, mt-bench, llmbar, refusals and hep, every input
    column whose name contains the subset name is averaged row-wise (NaNs
    ignored) and rounded to two decimals. The ``average`` column is then
    recomputed as the rounded NaN-ignoring mean of those five columns.

    The input frame is left untouched, so calling this twice on the same
    frame yields the same result (previously the aggregate columns were
    written into the caller's frame and were picked up by the substring match
    on the next call).

    Args:
        dataframe: frame with a ``model`` column, an ``average`` column and
            the per-subset score columns.

    Returns:
        A new frame with the columns ``model``, ``average`` and the five
        subset names, in that order.

    Raises:
        KeyError: if ``model`` or ``average`` is missing from the input.
    """
    subsets = ["alpacaeval", "mt-bench", "llmbar", "refusals", "hep"]

    # Work on a copy so the caller's frame does not gain the aggregate columns.
    working = dataframe.copy()
    source_columns = list(dataframe.columns)

    # For each subset, average the input columns that mention it by name.
    for subset in subsets:
        subset_cols = [col for col in source_columns if subset in col]
        working[subset] = np.round(
            np.nanmean(dataframe[subset_cols].values, axis=1), 2
        )

    # Explicit copy: assigning into a bare column selection triggers pandas'
    # chained-assignment warning and may not write through.
    result = working[["model", "average"] + subsets].copy()

    # Replace the average column with the mean over the subset aggregates.
    result["average"] = np.round(np.nanmean(result[subsets].values, axis=1), 2)
    return result
def expand_subsets(dataframe):
    """Placeholder for expanding subset columns; not implemented yet.

    The argument is ignored and ``None`` is returned.
    """
    # TODO: the data/ script needs to be modified before this can be done.
    return None
def _by_average_desc(frame):
    """Return the frame sorted by its 'average' column, highest first."""
    return frame.sort_values(by='average', ascending=False)


def _datatypes_for(frame):
    """Return one datatype per column: 'markdown' first, then 'number'."""
    numeric_count = len(frame.columns) - 1
    return ["markdown"] + ["number"] * numeric_count


# Load every table first, then derive the column types from the loaded frames.
herm_data = _by_average_desc(load_all_data(repo_dir_herm))
herm_data_avg = _by_average_desc(avg_over_herm(herm_data))
prefs_data = _by_average_desc(load_all_data(repo_dir_prefs))
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)

col_types_herm = _datatypes_for(herm_data)
col_types_herm_avg = _datatypes_for(herm_data_avg)
col_types_prefs = _datatypes_for(prefs_data)
# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
# Results viewer layout: a title row, then one tab per table plus an "About"
# tab that renders ABOUT_TEXT.
with gr.Blocks() as app:
    with gr.Row():
        gr.Markdown("# HERM Results Viewer")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("HERM - Overview"):
            with gr.Row():
                # Own name for the overview table so the detailed table
                # below no longer overwrites this handle.
                herm_table_avg = gr.Dataframe(
                    herm_data_avg.values,
                    datatype=col_types_herm_avg,
                    headers=herm_data_avg.columns.tolist(),
                    elem_id="herm_dataframe_avg",
                )
        with gr.TabItem("HERM - Detailed"):
            with gr.Row():
                herm_table = gr.Dataframe(
                    herm_data.values,
                    datatype=col_types_herm,
                    headers=herm_data.columns.tolist(),
                    elem_id="herm_dataframe",
                )
        with gr.TabItem("Pref Sets - Overview"):
            pref_sets_table = gr.Dataframe(
                prefs_data.values,
                datatype=col_types_prefs,
                headers=prefs_data.columns.tolist(),
                elem_id="prefs_dataframe",
            )
        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)


# Load data when app starts
def load_data_on_start():
    """Reload the three result tables from the local result folders.

    Reads the HERM results, derives the per-subset averages from that loaded
    frame, reads the preference-set results, and hands each frame to the
    ``update`` method of its table component.

    NOTE(review): nothing in the visible code calls this function or
    registers it on an event, and the values returned by ``update`` are
    discarded -- confirm how (or whether) it is meant to be wired in.
    """
    data_herm = load_all_data(repo_dir_herm)
    herm_table.update(data_herm)

    # avg_over_herm expects the loaded frame, not the directory path.
    data_herm_avg = avg_over_herm(data_herm)
    # The averaged frame belongs to the overview table, not the detailed one.
    herm_table_avg.update(data_herm_avg)

    data_prefs = load_all_data(repo_dir_prefs)
    pref_sets_table.update(data_prefs)


app.launch()