Yotam-Perlitz committed · Commit b5e722a · Parent(s): 5c9c592

improve writings

Signed-off-by: Yotam-Perlitz <y.perlitz@ibm.com>

app.py CHANGED
@@ -51,16 +51,17 @@ st.divider()
 
 st.markdown(
 """
-
-
-
-
-BenchBench is for you if:
+BenchBench rates benchmarks according to their agreement with the defined *Aggregate Benchmark* –
+an enhanced representation of the benchmarks that are out there (see config in sidebar to modify).
+
+
 """
 )
 
 st.markdown(
 """
+BenchBench is for you if:
+\n
 - **You have a new benchmark**: Show that it agrees/disagrees with known benchmarks.
 - **You are looking for a benchmark to run/trust**: Find an efficient/private/preferble alternative.
 """
@@ -68,11 +69,10 @@ st.markdown(
 
 st.markdown(
 """
-
-
-the benchmarks we compare to, and the models we use to compare with (see sidebar).
+We also show that agreements are best represented with the the BenchBench Score,
+the relative agreement (Z Score) of each benchmark to the Aggragate benchmark.
 \n
-
+Read more in our work [Benchmark Agreement Testing Done Right](https://arxiv.org/abs/2407.13696) and the [BenchBench repo](https://github.com/IBM/benchbench)
 """
 )
 
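The new intro text defines the BenchBench Score as the relative agreement (Z Score) of each benchmark to the Aggregate Benchmark. As a rough sketch of what such a standardization could look like (the benchmark names and correlation values below are made up, and this is not the repo's actual computation):

```python
import pandas as pd

# Hypothetical correlations of each benchmark with the Aggregate Benchmark.
corr_with_agg = pd.Series({"BenchA": 0.91, "BenchB": 0.82, "BenchC": 0.55})

# Relative agreement: standardize each correlation against the distribution
# of correlations across all benchmarks (zero mean, unit standard deviation).
relative_agreement = (corr_with_agg - corr_with_agg.mean()) / corr_with_agg.std()
print(relative_agreement.round(2))
```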
@@ -340,7 +340,8 @@ z_scores["date"] = z_scores["source"].apply(
 
 z_scores["date"] = pd.to_datetime("20" + z_scores["date"]).dt.date
 
-z_score_name = "
+z_score_name = "BenchBench Score"
+p_val_name = "p val"
 
 data = (
 z_scores.rename(
@@ -348,7 +349,7 @@ data = (
 "scenario": "Benchmark",
 "z_score": z_score_name,
 "corr_with_agg": corr_name,
-"p_value_of_corr_with_agg":
+"p_value_of_corr_with_agg": p_val_name,
 # "n_models_of_corr_with_agg": "# Models Used",
 "source": "Source",
 "date": "Snapshot Date",
@@ -376,12 +377,12 @@ styled_data = (
 )
 .apply(highlight_uploaded_benchmark, axis=1)
 .background_gradient(
-subset=[
+subset=[p_val_name],
 cmap="Reds",
 vmin=0.1,
 vmax=1,
 )
-.format(subset=[z_score_name, corr_name,
+.format(subset=[z_score_name, corr_name, p_val_name], formatter="{:.2}")
 .set_properties(**{"text-align": "center"})
 )
 
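The styling change above uses the standard pandas Styler API: .background_gradient shades only the p-value column on a fixed scale from 0.1 to 1, and .format rounds the numeric columns to two significant digits. A self-contained sketch with made-up column names and values (not the app's real data):

```python
import pandas as pd

df = pd.DataFrame(
    {"Benchmark": ["A", "B"], "BenchBench Score": [1.3, -0.7], "p val": [0.02, 0.4]}
)
styled = (
    df.style
    # Shade only the p-value column; requires matplotlib for the colormap.
    .background_gradient(subset=["p val"], cmap="Reds", vmin=0.1, vmax=1)
    # Two significant digits for the numeric columns.
    .format(subset=["BenchBench Score", "p val"], formatter="{:.2}")
    .set_properties(**{"text-align": "center"})
)
# In a Streamlit app, the Styler can be passed directly to st.dataframe(styled, ...).
```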
@@ -389,7 +390,7 @@ cols_used = [
 "Benchmark",
 z_score_name,
 corr_name,
-
+p_val_name,
 "Snapshot Date",
 ]
 
@@ -399,7 +400,7 @@ st.dataframe(
 column_order=cols_used,
 hide_index=True,
 use_container_width=True,
-height=
+height=300,
 column_config={col: {"alignment": "center"} for col in cols_used},
 )
 
@@ -420,9 +421,41 @@ with st.expander(label="Aggragate Benchmark scores"):
 use_container_width=True,
 )
 
-
-
-
+left, right = st.columns([1, 1])
+
+with left:
+    with st.expander(label="Cite Us!"):
+        st.code(
+            r"""
+
+@misc{perlitz2024llmbenchmarksagreefixing,
+title={Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with BenchBench},
+author={Yotam Perlitz and Ariel Gera and Ofir Arviv and Asaf Yehudai and Elron Bandel and Eyal Shnarch and Michal Shmueli-Scheuer and Leshem Choshen},
+year={2024},
+eprint={2407.13696},
+archivePrefix={arXiv},
+primaryClass={cs.CL},
+url={https://arxiv.org/abs/2407.13696},
+}
+
+"""
+        )
+
+with right:
+    with st.expander(label="Cite Everyone Else!"):
+        st.code(
+            r"""
+
+@misc{perlitz2024llmbenchmarksagreefixing,
+title={Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with BenchBench},
+author={Yotam Perlitz and Ariel Gera and Ofir Arviv and Asaf Yehudai and Elron Bandel and Eyal Shnarch and Michal Shmueli-Scheuer and Leshem Choshen},
+year={2024},
+eprint={2407.13696},
+archivePrefix={arXiv},
+primaryClass={cs.CL},
+url={https://arxiv.org/abs/2407.13696},
+}
+
 
 @misc{berkeley-function-calling-leaderboard,
 title={Berkeley Function Calling Leaderboard},
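The citation block added here follows a common Streamlit layout pattern: two equal-width columns, each holding an expander that wraps an st.code block. Stripped of the long BibTeX strings, a minimal sketch (labels and snippet text are placeholders, not the app's content) could be:

```python
import streamlit as st

left, right = st.columns([1, 1])  # two equal-width columns
with left:
    with st.expander(label="Cite Us!"):
        st.code(r"@misc{example2024, title={Example}}")   # placeholder BibTeX
with right:
    with st.expander(label="Cite Everyone Else!"):
        st.code(r"@misc{other2024, title={Other work}}")  # placeholder BibTeX
```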
@@ -694,7 +727,7 @@ with st.expander(label="Citations"):
 }
 
 """
-
+        )
 
 
 st.subheader("Benchmark Report Card")
@@ -714,9 +747,9 @@ plotted_scenario = st.selectbox(
 
 col1, col2, col3 = st.columns(3)
 cur_data = data.query(f"Benchmark=='{plotted_scenario}'")
-col1.metric("Relative agreement", cur_data[
+col1.metric("Relative agreement", cur_data[z_score_name])
 col2.metric(corr_name, cur_data[corr_name])
-col3.metric("p-value of Corr.", cur_data[
+col3.metric("p-value of Corr.", cur_data[p_val_name])
 
 cur_df = allbench.df.query(f'scenario=="aggregate" or scenario=="{plotted_scenario}"')
 
@@ -837,3 +870,18 @@ st.image(
 caption="**BenchBench's Standardized Approach Reduces Variance.** This ablation study demonstrates that following the best practices implemented in BenchBench significantly reduces the variance of BAT results, leading to more robust and reliable conclusions.",
 use_column_width=True,
 )
+
+
+st.code(
+r"""
+@misc{perlitz2024llmbenchmarksagreefixing,
+title={Do These LLM Benchmarks Agree? Fixing Benchmark Evaluation with BenchBench},
+author={Yotam Perlitz and Ariel Gera and Ofir Arviv and Asaf Yehudai and Elron Bandel and Eyal Shnarch and Michal Shmueli-Scheuer and Leshem Choshen},
+year={2024},
+eprint={2407.13696},
+archivePrefix={arXiv},
+primaryClass={cs.CL},
+url={https://arxiv.org/abs/2407.13696},
+}
+"""
+)