evalverse-space / app.py
jihoo-kim's picture
Update score DB 240516
f57813a
raw
history blame
15.4 kB
import gradio as gr
import pandas as pd
import pingouin as pg
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt
# Shared Gradio theme for the whole app: the stock "Soft" theme with an
# indigo/gray palette, the Source Sans Pro web font, and a few colour overrides.
my_theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="gray",
    font=[gr.themes.GoogleFont("Source Sans Pro")],
).set(
    # Colour values are emitted as CSS, so they must be valid CSS colours:
    # a named colour takes no "#" prefix ("#White" is not a valid value).
    body_background_fill="White",
    block_background_fill="White",
    button_primary_background_fill="#8B71FF",
    button_cancel_text_color="White",
)
sns.set(color_codes=True, font_scale=1.2)

# The score database is a dated CSV snapshot; TARGET_DATE selects the file and
# is also part of the "Arena Elo (<date>)" column name used further below.
TARGET_DATE = "240516"
SCORE_PATH = f"db/score_{TARGET_DATE}.csv"

score_df = pd.read_csv(SCORE_PATH)

# H6-Avg: per-model mean of the six listed benchmark columns, rounded to 2 dp.
score_df["H6-Avg"] = (
    score_df[["ARC-c", "HellaSwag", "MMLU", "TruthfulQA", "WinoGrande", "GSM-8K"]]
    .mean(axis=1)
    .round(2)
)

# Every column from index 3 onwards is offered as a selectable score, except
# the Arena Elo column(s). Built with a comprehension rather than removing
# items from the list while iterating over it, which skips the element that
# follows each removal.
AVAILABLE_SCORES = [
    score for score in score_df.columns[3:].tolist() if "Arena Elo" not in score
]
AVAILABLE_MODELS = score_df["Model"].to_list()

# Initial selections shown in the report tab.
DEFAULT_SCORES = ["ARC-c", "HellaSwag", "MMLU", "TruthfulQA", "WinoGrande", "GSM-8K"]
DEFAULT_MODELS = [
    "SOLAR-10.7B-Instruct-v1.0",
    "Mistral-7B-Instruct-v0.2",
    "Gemma-7B-it",
    "Llama-3-8b-instruct",
]
def get_report(models_list, benchmarks_list):
    """Build the radar chart and the two summary tables for a model selection.

    Args:
        models_list: Model names to report on; rows of ``score_df`` whose
            ``Model`` value is in this collection are kept.
        benchmarks_list: Names of the benchmark columns to include. The list
            is only read, never modified.

    Returns:
        A ``(fig, rank_table, score_table)`` tuple:

        * ``fig``: Plotly polar line chart with one closed trace per model
          over the selected benchmarks.
        * ``rank_table``: Organization, Model, Size, Ranking and Total_avg,
          sorted by Ranking.
        * ``score_table``: Model plus the selected benchmark columns, in the
          same row order.
    """
    # Work on a private copy: the caller's list may be a shared object (for
    # example a module-level default) and must not be altered by the MT-Bench
    # substitution performed for the chart below.
    benchmarks = list(benchmarks_list)

    report_df = score_df.copy()
    report_df["MT-Bench (x10)"] = report_df["MT-Bench"] * 10
    report_df = report_df[report_df["Model"].isin(models_list)]

    table = report_df[["Organization", "Model", "Size"] + benchmarks].copy()
    table["Total_avg"] = table[benchmarks].mean(axis=1).round(2)
    # method="min" gives tied models the same, best rank of their group
    # (1, 1, 3, ...) instead of an averaged rank that is then truncated by the
    # int cast. na_option="bottom" ranks rows without any score last, so the
    # int cast cannot fail on NaN.
    table["Ranking"] = (
        table["Total_avg"]
        .rank(ascending=False, method="min", na_option="bottom")
        .astype(int)
    )
    table = table.sort_values("Ranking").reset_index(drop=True)

    rank_table = table[["Organization", "Model", "Size", "Ranking", "Total_avg"]]
    score_table = table[["Model"] + benchmarks]

    # In the chart only, MT-Bench is replaced by its x10 column and moved to
    # the end of the axis order; the tables above keep the raw MT-Bench score.
    plot_benchmarks = benchmarks
    if "MT-Bench" in plot_benchmarks:
        plot_benchmarks = [b for b in plot_benchmarks if b != "MT-Bench"]
        plot_benchmarks.append("MT-Bench (x10)")

    # Long format expected by plotly express: one row per (model, benchmark).
    scores = [
        [model, bench, value]
        for bench in plot_benchmarks
        for model, value in zip(report_df["Model"], report_df[bench])
    ]
    figure_df = pd.DataFrame(scores, columns=["model", "benchmark", "score"])

    fig = px.line_polar(
        figure_df,
        r="score",
        theta="benchmark",
        line_close=True,
        category_orders={"benchmark": plot_benchmarks},
        color="model",
        markers=True,
        color_discrete_sequence=px.colors.qualitative.Pastel,
        title="LLM Evaluation Report (by Evalverse)",
        width=800,
    )
    return fig, rank_table, score_table
def get_corr_table(benchmarks_list=None):
    """Return the pairwise Pearson correlation table for the score columns.

    Args:
        benchmarks_list: Optional list of benchmark column names. When given
            (and non-empty), the ``Arena Elo (<TARGET_DATE>)`` column is
            prepended to it; otherwise every column of ``score_df`` from
            index 3 onwards is used.

    Returns:
        The result of calling ``pairwise_corr(method="pearson")`` on the
        selected columns.
    """
    if not benchmarks_list:
        columns = score_df.columns[3:]
    else:
        columns = [f"Arena Elo ({TARGET_DATE})"] + benchmarks_list

    # NOTE(review): pairwise_corr is not a stock pandas method; presumably it
    # is made available on DataFrame by the pingouin import - confirm.
    selected_scores = score_df[columns]
    return selected_scores.pairwise_corr(method="pearson")
def get_corr_figure(benchmarks_list=None):
    """Draw an annotated correlation heatmap of the score columns.

    Args:
        benchmarks_list: Optional list of benchmark column names. When given
            (and non-empty), the ``Arena Elo (<TARGET_DATE>)`` column is
            prepended to it; otherwise every column of ``score_df`` from
            index 3 onwards is used.

    Returns:
        The ``matplotlib.pyplot`` module itself, with the freshly drawn
        heatmap as its current figure.
    """
    columns = (
        [f"Arena Elo ({TARGET_DATE})"] + benchmarks_list
        if benchmarks_list
        else score_df.columns[3:]
    )
    correlations = score_df[columns].corr()

    # The calls below go through pyplot's implicit "current figure", so their
    # order matters: open the figure first, then draw and decorate it.
    plt.figure(figsize=(21, 14))
    sns.heatmap(correlations, annot=True, cmap="RdBu", linewidths=3)
    plt.xticks(rotation=45)
    plt.title("Correlation - LLM Benchmarks", size=30)
    return plt
def get_analysis_figure(bench_name):
    """Scatter one benchmark against Arena Elo, with an OLS trendline.

    Args:
        bench_name: Name of the ``score_df`` column plotted on the x axis.

    Returns:
        A Plotly scatter figure of ``bench_name`` (x) versus the
        ``Arena Elo (<TARGET_DATE>)`` column (y), with marginal histograms on
        both axes and Organization/Model shown on hover.
    """
    arena_column = f"Arena Elo ({TARGET_DATE})"
    scatter_options = {
        "x": bench_name,
        "y": arena_column,
        "marginal_x": "histogram",
        "marginal_y": "histogram",
        "width": 450,
        "hover_data": ["Organization", "Model"],
        "trendline": "ols",
        "trendline_color_override": "#27138F",
    }
    return px.scatter(score_df, **scatter_options)
# Values shown when the page first loads, before any button is pressed.
report_plot, rank_table, score_table = get_report(DEFAULT_MODELS, DEFAULT_SCORES)
corr_table = get_corr_table()

# Markdown bodies are kept as module-level constants so the layout code below
# stays readable. The text itself is user-facing content.
_HEADER_MD = """
The Universe of Evaluation. All about the evaluation for LLMs.\n
Run an evaluation for your LLM with **`Evalverse`** [[Github](https://github.com/UpstageAI/evalverse) • [Paper](https://arxiv.org/abs/2404.00943) • [Docs](https://evalverse.gitbook.io/evalverse-docs)].
"""

# Legend for the columns of the correlation table shown in the analysis tab.
_CORR_LEGEND_MD = """
- `X`: Name(s) of first columns.
- `Y`: Name(s) of second columns.
- `method`: Correlation type.
- `alternative`: Tail of the test.
- `n`: Sample size (after removal of missing values).
- `r`: Correlation coefficients.
- `CI95%`: 95% parametric confidence intervals.
- `p-unc`: Uncorrected p-values.
- `BF10`: Bayes Factor of the alternative hypothesis (only for Pearson correlation)
- `power`: achieved power of the test (= 1 - type II error).
Reference: https://pingouin-stats.org/build/html/generated/pingouin.pairwise_corr.html#pingouin.pairwise_corr
"""

_ABOUT_INTRO_MD = """
## 🌌 Introduction
**Evalverse** is a freely accessible, open-source project designed to support your LLM (Large Language Model) evaluation needs. We provide a simple, standardized, and user-friendly solution for the processing and management of LLM evaluations, catering to the needs of AI research engineers and scientists. We also support no-code evaluation processes for people who may have less experience working with LLMs. Moreover, you will receive a well-organized report with figures summarizing the evaluation results.
"""

_ABOUT_BENEFITS_MD = """
### With Evalverse, you are empowered to
- access various evaluation methods without juggling multiple libraries.
- receive insightful report about the evaluation results that helps you to compare the varied scores across different models.
- initiate evaluation and generate reports without any code via Slack bot.
## 🌌 Architecture of Evalverse
"""

_ABOUT_DETAILS_MD = """
- `Submodule`. The Submodule serves as the evaluation engine that is responsible for the heavy lifting involved in evaluating LLMs. Publicly available LLM evaluation libraries can be integrated into Evalverse as submodules. This component makes Evalverse expandable, thereby ensuring that the library remains up-to-date.
- `Connector`. The Connector plays a role in linking the Submodules with the Evaluator. It contains evaluation scripts, along with the necessary arguments, from various external libraries.
- `Evaluator`. The Evaluator performs the requested evaluations on the Compute Cluster by utilizing the evaluation scripts from the Connector. The Evaluator can receive evaluation requests either from the Reporter, which facilitates a no-code evaluation approach, or directly from the end-user for code-based evaluation.
- `Compute Cluster`. The Compute Cluster is the collection of hardware accelerators needed to execute the LLM evaluation processes. When the Evaluator schedules an evaluation job to be ran, the Compute Cluster fetches the required model and data files from the Database. The results of the evaluation jobs are sent to the Database for storage.
- `Database`. The Database stores the model files and data needed in the evaluation processes, along with evaluation results. The stored evaluation results are used by the Reporter to create evaluation reports for the user.
- `Reporter`. The Reporter handles the evaluation and report requests sent by the users, allowing for a no-code approach to LLM evaluation. The Reporter sends the requested evaluation jobs to the Evaluator and fetches the evaluation results from the Database, which are sent to the user via an external communication platform such as Slack. Through this, users can receive table and figure that summarize evaluation results.
## 🌌 Key Features of Evalverse
- **Unified evaluation with Submodules**: Evalverse extends its evaluation capabilities through Git submodules, effortlessly incorporating frameworks like [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) and [FastChat](https://github.com/lm-sys/FastChat). Swiftly add new tools and keep pace with the latest in LLM evaluation.
- **No-code evaluation request**: With Evalverse, request LLM evaluations without any code, simply by sending `Request!` in a direct message or Slack channel with an activate Evalverse Slack bot. Enter the model name in the Huggingface hub or local model directory path in Slack, and let the bot handle the rest.
- **LLM evaluation report**: Obtain comprehensive, no-code reports from Evalverse. Request with a simple command -`Report!`-, select the model and evaluation criteria, and receive detailed reports with scores, rankings, and visuals, all generated from the stored score database.
## 🌌 Supported Evaluations
We currently support four evaluation methods. If you have suggestions for new methods, we welcome your input!
| Evaluation | Original Repository |
|---------------------------|--------------------------------------------|
| H6 (Open LLM Leaderboard) | [EleutherAI](https://github.com/EleutherAI)/[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)|
| MT-bench | [lm-sys](https://github.com/lm-sys)/[FastChat](https://github.com/lm-sys/FastChat)|
| IFEval | [google-research](https://github.com/google-research/google-research/tree/master)/[instruction_following_eval](https://github.com/google-research/google-research/tree/master/instruction_following_eval)|
| EQ-Bench | [EQ-bench](https://github.com/EQ-bench)/[EQ-Bench](https://github.com/EQ-bench/EQ-Bench)|
## 🌌 Acknowledgements
Evalverse is an open-source project orchestrated by the **Data-Centric LLM Team** at `Upstage`, designed as an ecosystem for LLM evaluation. Launched in April 2024, this initiative stands at the forefront of advancing evaluation handling in the realm of large language models (LLMs).
## 🌌 License
Evalverse is completely freely-accessible open-source and licensed under the Apache License 2.0.
## 🌌 Citation
If you want to cite our 🌌 Evalverse project, feel free to use the following bibtex. You can check our paper via [link](https://arxiv.org/abs/2404.00943).
```bibtex
@misc{kim2024evalverse,
title={Evalverse: Unified and Accessible Library for Large Language Model Evaluation},
author={Jihoo Kim and Wonho Song and Dahyun Kim and Yunsu Kim and Yungi Kim and Chanjun Park},
year={2024},
eprint={2404.00943},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
"""

_SCORE_SOURCES_MD = """
- [HuggingFace Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
- [LMSYS Chatbot Arena Leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard)
- [EQ-Bench Leaderboard](https://eqbench.com/)
- [Arena-Hard Leaderboard](https://lmsys.org/blog/2024-04-19-arena-hard/#full-leaderboard-with-gpt-4-turbo-as-judge)
- [AlpacaEval Leaderboard](https://tatsu-lab.github.io/alpaca_eval/)
- Results from [Evalverse](https://github.com/UpstageAI/evalverse)
"""

with gr.Blocks(theme=my_theme) as demo:
    # Page header: logo followed by the tagline and project links.
    with gr.Row():
        gr.Image(
            "asset/evalverse_logo.png",
            show_label=False,
            show_download_button=False,
            scale=0.4,
        )
    with gr.Row():
        gr.Markdown(_HEADER_MD)

    # Tab 1: pick models and benchmarks, get a radar chart and two tables.
    with gr.Tab("📊 LLM Evaluation Report"):
        with gr.Row():
            model_list = gr.Dropdown(
                AVAILABLE_MODELS,
                value=DEFAULT_MODELS,
                multiselect=True,
                label="Models",
                info="Select models to evaluate",
            )
            bench_list = gr.Dropdown(
                AVAILABLE_SCORES,
                value=DEFAULT_SCORES,
                multiselect=True,
                label="Benchmarks",
                info="Select benchmarks to evaluate",
            )
        # NOTE(review): the button is placed below the dropdown row rather
        # than inside it - confirm against the intended layout.
        btn = gr.Button("Report!", variant="primary")
        with gr.Row():
            output_figure = gr.Plot(report_plot, label="Report")
        with gr.Row():
            gr.Markdown("## Summary")
        with gr.Row():
            output_rank_table = gr.DataFrame(rank_table)
        with gr.Row():
            gr.Markdown("## Detailed scores")
        with gr.Row():
            output_score_table = gr.DataFrame(score_table)
        btn.click(
            fn=get_report,
            inputs=[model_list, bench_list],
            outputs=[output_figure, output_rank_table, output_score_table],
        )

    # Tab 2: per-benchmark scatter plots against Arena Elo, plus the full
    # correlation heatmap and statistics table.
    with gr.Tab("🧐 LLM Evaluation Analysis"):
        with gr.Row():
            bench_a = gr.Dropdown(
                AVAILABLE_SCORES,
                value="MT-Bench",
                label="A Benchmark",
                info="Select a benchmark to analyze the correlation with Arena Elo",
            )
            bench_b = gr.Dropdown(
                AVAILABLE_SCORES,
                value="H6-Avg",
                label="B Benchmark",
                info="Select a benchmark to analyze the correlation with Arena Elo",
            )
        with gr.Row():
            btn_a = gr.Button("Analyze A!", variant="primary")
            btn_b = gr.Button("Analyze B!", variant="primary")
        with gr.Row():
            # Initial plots match the default values of the two dropdowns.
            mtbench_figure = get_analysis_figure("MT-Bench")
            h6avg_figure = get_analysis_figure("H6-Avg")
            figure_a = gr.Plot(mtbench_figure, label="Selected A")
            figure_b = gr.Plot(h6avg_figure, label="Selected B")
        btn_a.click(fn=get_analysis_figure, inputs=bench_a, outputs=figure_a)
        btn_b.click(fn=get_analysis_figure, inputs=bench_b, outputs=figure_b)
        with gr.Row():
            gr.Markdown("## Analysis")
        with gr.Row():
            corr_figure = get_corr_figure()
            output_corr_figure = gr.Plot(corr_figure, label="Correlations")
        with gr.Row():
            output_corr_table = gr.DataFrame(corr_table, label="Detailed statistics")
        with gr.Row():
            gr.Markdown(_CORR_LEGEND_MD)

    # Tab 3: static project description.
    with gr.Tab("🌌 About Evalverse"):
        gr.Markdown(_ABOUT_INTRO_MD)
        with gr.Row():
            gr.Image(
                "asset/overview.png",
                show_label=False,
                show_download_button=False,
                scale=0.6,
            )
        gr.Markdown(_ABOUT_BENEFITS_MD)
        with gr.Row():
            gr.Image(
                "asset/architecture.png",
                show_label=False,
                show_download_button=False,
                scale=0.8,
            )
        gr.Markdown(_ABOUT_DETAILS_MD)

    # NOTE(review): placed at Blocks level so it shows under every tab;
    # confirm it was not meant to live inside the About tab.
    with gr.Row():
        with gr.Accordion("The scores are collected from ...", open=False):
            gr.Markdown(_SCORE_SOURCES_MD)

if __name__ == "__main__":
    demo.launch()