Spaces:
Running
Running
File size: 16,150 Bytes
fea7ea6 399f6b4 fea7ea6 399f6b4 fea7ea6 f57813a fea7ea6 399f6b4 fea7ea6 f57813a fea7ea6 399f6b4 fea7ea6 f57813a fea7ea6 f57813a fea7ea6 399f6b4 fea7ea6 399f6b4 fea7ea6 f57813a 399f6b4 f57813a fea7ea6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 |
import gradio as gr
import pandas as pd
import pingouin as pg
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt
# App-wide Gradio theme: the stock Soft theme with indigo/gray hues, the
# "Source Sans Pro" Google font, and a handful of color overrides.
my_theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="gray",
    font=[gr.themes.GoogleFont("Source Sans Pro")],
).set(
    # A leading "#" must be followed by hex digits, so "#White" is not a valid
    # CSS color; use the named color, exactly as block_background_fill does.
    body_background_fill="White",
    block_background_fill="White",
    button_primary_background_fill="#8B71FF",
    button_cancel_text_color="White",
)
# Global seaborn styling applied to every matplotlib figure drawn below.
sns.set(color_codes=True, font_scale=1.2)

# Date tag embedded in the name of the Arena Elo column used as the reference.
TARGET_DATE = "240515"
# CSV snapshot holding one row per model and one column per score.
SCORE_PATH = "db/score_240517.csv"

# The six benchmarks that are averaged into the derived "H6-Avg" column.
_H6_BENCHMARKS = [
    "ARC-c",
    "HellaSwag",
    "MMLU",
    "TruthfulQA",
    "WinoGrande",
    "GSM-8K",
]

score_df = pd.read_csv(SCORE_PATH)
# Row-wise mean of the six benchmarks, rounded to two decimals.
score_df["H6-Avg"] = score_df[_H6_BENCHMARKS].mean(axis=1).round(2)

# Columns from position 8 onward (including the freshly added "H6-Avg") are
# the benchmarks offered in the dropdowns.
AVAILABLE_SCORES = list(score_df.columns[8:])
AVAILABLE_MODELS = score_df["Model"].tolist()

# Initial dropdown selections.
DEFAULT_SCORES = list(_H6_BENCHMARKS)
DEFAULT_MODELS = [
    "SOLAR-10.7B-Instruct-v1.0",
    "Mistral-7B-Instruct-v0.2",
    "Gemma-7B-it",
    "Llama-3-8b-instruct",
]
def get_report(models_list, benchmarks_list):
    """Build the radar chart and the summary tables for a model comparison.

    Args:
        models_list: Model names matched against the "Model" column of
            ``score_df``.
        benchmarks_list: Names of the score columns to report. The list is
            never modified.

    Returns:
        A ``(fig, rank_table, score_table)`` tuple:

        * ``fig`` - Plotly polar line chart with one trace per model. If
          "MT-Bench" was requested it is plotted as "MT-Bench (x10)" and
          placed last on the angular axis.
        * ``rank_table`` - organization, model, size, ranking and the
          row-wise mean ("Total_avg") of the requested benchmarks, sorted by
          ranking.
        * ``score_table`` - model name plus the requested benchmark columns,
          in the same row order.
    """
    # Private copy: the caller's list (e.g. the module-level defaults) must
    # not be altered by the MT-Bench substitution below.
    benchmarks = list(benchmarks_list)

    report_df = score_df[score_df["Model"].isin(models_list)].copy()
    report_df["MT-Bench (x10)"] = report_df["MT-Bench"] * 10

    table = report_df[["Organization", "Model", "Size"] + benchmarks].copy()
    # NOTE(review): the average uses the raw "MT-Bench" column, not the x10
    # variant that is plotted - confirm this mix of scales is intended.
    table["Total_avg"] = table[benchmarks].mean(axis=1).round(2)
    # method="min" gives tied models the same, best rank instead of an
    # averaged rank that would be truncated by the int cast; na_option="bottom"
    # ranks rows without any score last instead of failing on the NaN cast.
    table["Ranking"] = (
        table["Total_avg"]
        .rank(ascending=False, method="min", na_option="bottom")
        .astype(int)
    )
    table = table.sort_values("Ranking").reset_index(drop=True)

    rank_table = table[["Organization", "Model", "Size", "Ranking", "Total_avg"]]
    score_table = table[["Model"] + benchmarks]

    # For the chart only, swap MT-Bench for its x10 variant (appended last).
    plot_benchmarks = list(benchmarks)
    if "MT-Bench" in plot_benchmarks:
        plot_benchmarks.remove("MT-Bench")
        plot_benchmarks.append("MT-Bench (x10)")

    # Long format expected by Plotly: one (model, benchmark, score) row each.
    rows = [
        (model, bench, score)
        for bench in plot_benchmarks
        for model, score in zip(report_df["Model"], report_df[bench])
    ]
    figure_df = pd.DataFrame(rows, columns=["model", "benchmark", "score"])

    fig = px.line_polar(
        figure_df,
        r="score",
        theta="benchmark",
        line_close=True,
        category_orders={"benchmark": plot_benchmarks},
        color="model",
        markers=True,
        color_discrete_sequence=px.colors.qualitative.Pastel,
        title="LLM Evaluation Report (by Evalverse)",
        width=800,
    )
    return fig, rank_table, score_table
def get_corr_table(benchmarks_list=None):
    """Return pairwise Pearson correlation statistics between score columns.

    Args:
        benchmarks_list: Optional list of score column names. When given, the
            ``Arena Elo (<TARGET_DATE>)`` column is placed in front of them;
            when empty or omitted, every column of ``score_df`` from position
            4 onward is used.

    Returns:
        The result of ``pairwise_corr(method="pearson")`` on the selected
        columns.
    """
    if not benchmarks_list:
        columns = score_df.columns[4:]
    else:
        columns = [f"Arena Elo ({TARGET_DATE})"] + benchmarks_list
    return score_df[columns].pairwise_corr(method="pearson")
def get_corr_figure(benchmarks_list=None):
    """Draw a heat map of the correlations between score columns.

    Args:
        benchmarks_list: Optional list of score column names. When given, the
            ``Arena Elo (<TARGET_DATE>)`` column is placed in front of them;
            when empty or omitted, every column of ``score_df`` from position
            4 onward is used.

    Returns:
        The ``matplotlib.pyplot`` module, whose current figure is the newly
        drawn heat map.
    """
    selected = (
        [f"Arena Elo ({TARGET_DATE})"] + benchmarks_list
        if benchmarks_list
        else score_df.columns[4:]
    )
    correlations = score_df[selected].corr()

    # pyplot is stateful: open the figure first so the calls below target it.
    plt.figure(figsize=(21, 14))
    sns.heatmap(correlations, annot=True, cmap="RdBu", linewidths=3)
    plt.xticks(rotation=45)
    plt.title("Correlation - LLM Benchmarks", size=30)
    return plt
def get_analysis_figure(bench_name):
    """Scatter one benchmark against the Arena Elo reference column.

    Args:
        bench_name: Name of the ``score_df`` column shown on the x axis.

    Returns:
        A Plotly scatter figure with marginal histograms on both axes and an
        OLS trend line; hovering a point shows its organization and model.
    """
    arena_column = f"Arena Elo ({TARGET_DATE})"
    scatter_options = dict(
        x=bench_name,
        y=arena_column,
        marginal_x="histogram",
        marginal_y="histogram",
        width=450,
        hover_data=["Organization", "Model"],
        trendline="ols",
        trendline_color_override="#27138F",
    )
    return px.scatter(score_df, **scatter_options)
# Render the default report and the all-column correlation table once at
# import time; they are the initial values shown by the UI components.
_initial_report = get_report(DEFAULT_MODELS, DEFAULT_SCORES)
report_plot, rank_table, score_table = _initial_report
corr_table = get_corr_table()
# Gradio UI: a header followed by four tabs (report, analysis, full
# leaderboard, about) and a collapsible list of score sources.
# NOTE(review): several emoji/bullet characters in the labels and Markdown
# below look mis-encoded in this copy of the file; they are reproduced
# unchanged - confirm against the original source.
with gr.Blocks(theme=my_theme) as demo:
    with gr.Row():
        # NOTE(review): Gradio documents `scale` as an integer; the fractional
        # values used for the images in this file may trigger a warning or be
        # ignored - confirm.
        gr.Image(
            "asset/evalverse_logo.png",
            show_label=False,
            show_download_button=False,
            scale=0.4,
        )
    with gr.Row():
        gr.Markdown(
            """
The Universe of Evaluation. All about the evaluation for LLMs.\n
Run an evaluation for your LLM with **`Evalverse`** [[Github](https://github.com/UpstageAI/evalverse) β’ [Paper](https://arxiv.org/abs/2404.00943) β’ [Docs](https://evalverse.gitbook.io/evalverse-docs)].
### π Newly updated
[2024.05.17]
- Weekly scores: `Arena Elo (240515)`, `Arena Elo (240508)`, `Arena Elo (240501)`
- New benchmarks: [`AlpacaEval 2.0`](https://tatsu-lab.github.io/alpaca_eval/), [`MMLU-Pro`](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)
- New models: `GPT-4o-0513`, `Grok-1`, `OpenELM`, `Qwen-Max-0428`, `Snowflake-Arctic-Instruct`, `Yi-Large`
- New tab: `π Full leaderboard`
"""
        )

    # --- Tab 1: radar chart and tables for a chosen set of models ----------
    with gr.Tab("π LLM Evaluation Report"):
        # NOTE(review): nesting was reconstructed from a flattened source; the
        # button is assumed to share the row with the two dropdowns - confirm.
        with gr.Row():
            model_list = gr.Dropdown(
                AVAILABLE_MODELS,
                value=DEFAULT_MODELS,
                multiselect=True,
                label="Models",
                info="Select models to evaluate",
            )
            bench_list = gr.Dropdown(
                AVAILABLE_SCORES,
                value=DEFAULT_SCORES,
                multiselect=True,
                label="Benchmarks",
                info="Select benchmarks to evaluate",
            )
            btn = gr.Button("Report!", variant="primary")
        with gr.Row():
            output_figure = gr.Plot(report_plot, label="Report")
        with gr.Row():
            gr.Markdown("## Summary")
        with gr.Row():
            output_rank_table = gr.DataFrame(rank_table)
        with gr.Row():
            gr.Markdown("## Detailed scores")
        with gr.Row():
            output_score_table = gr.DataFrame(score_table)
        # Regenerate the chart and both tables from the current selections.
        btn.click(
            fn=get_report,
            inputs=[model_list, bench_list],
            outputs=[output_figure, output_rank_table, output_score_table],
        )

    # --- Tab 2: correlation of benchmarks with Arena Elo -------------------
    with gr.Tab("π§ LLM Evaluation Analysis"):
        with gr.Row():
            bench_a = gr.Dropdown(
                AVAILABLE_SCORES,
                value="MT-Bench",
                label="A Benchmark",
                info="Select a benchmark to analyze the correlation with Arena Elo",
            )
            bench_b = gr.Dropdown(
                AVAILABLE_SCORES,
                value="H6-Avg",
                label="B Benchmark",
                info="Select a benchmark to analyze the correlation with Arena Elo",
            )
        with gr.Row():
            btn_a = gr.Button("Analyze A!", variant="primary")
            btn_b = gr.Button("Analyze B!", variant="primary")
        with gr.Row():
            # Initial plots match the default values of the two dropdowns.
            mtbench_figure = get_analysis_figure("MT-Bench")
            h6avg_figure = get_analysis_figure("H6-Avg")
            figure_a = gr.Plot(mtbench_figure, label="Selected A")
            figure_b = gr.Plot(h6avg_figure, label="Selected B")
        btn_a.click(fn=get_analysis_figure, inputs=bench_a, outputs=figure_a)
        btn_b.click(fn=get_analysis_figure, inputs=bench_b, outputs=figure_b)
        with gr.Row():
            gr.Markdown("## Analysis")
        with gr.Row():
            corr_figure = get_corr_figure()
            output_corr_figure = gr.Plot(corr_figure, label="Correlations")
        with gr.Row():
            output_corr_table = gr.DataFrame(corr_table, label="Detailed statistics")
        with gr.Row():
            # Legend for the columns of the statistics table above; the
            # confidence-interval column is named `CI95%` by pingouin.
            gr.Markdown(
                """
- `X`: Name(s) of first columns.
- `Y`: Name(s) of second columns.
- `method`: Correlation type.
- `alternative`: Tail of the test.
- `n`: Sample size (after removal of missing values).
- `r`: Correlation coefficients.
- `CI95%`: 95% parametric confidence intervals.
- `p-unc`: Uncorrected p-values.
- `BF10`: Bayes Factor of the alternative hypothesis (only for Pearson correlation)
- `power`: achieved power of the test (= 1 - type II error).
Reference: https://pingouin-stats.org/build/html/generated/pingouin.pairwise_corr.html#pingouin.pairwise_corr
"""
            )

    # --- Tab 3: every model, sorted by the selected score columns ----------
    with gr.Tab("π Full leaderboard"):
        # Build the Elo column name from TARGET_DATE, like the rest of the
        # file, so a date bump only has to be made in one place.
        lb_selected = [
            f"Arena Elo ({TARGET_DATE})",
            "MT-Bench",
            "MMLU",
            "Arena-Hard",
            "EQ-Bench",
            "MAGI-Hard",
            "LC-AlpacaEval-2.0",
            "MMLU-Pro",
            "H6-Avg",
        ]
        lb = score_df[["Organization", "Model", "Size"] + lb_selected]
        # Highest scores first; columns are used as sort keys in list order.
        lb = lb.sort_values(lb_selected, ascending=False)
        gr.DataFrame(lb)

    # --- Tab 4: static project description ---------------------------------
    with gr.Tab("π About Evalverse"):
        gr.Markdown(
            """
## π Introduction
**Evalverse** is a freely accessible, open-source project designed to support your LLM (Large Language Model) evaluation needs. We provide a simple, standardized, and user-friendly solution for the processing and management of LLM evaluations, catering to the needs of AI research engineers and scientists. We also support no-code evaluation processes for people who may have less experience working with LLMs. Moreover, you will receive a well-organized report with figures summarizing the evaluation results.
"""
        )
        with gr.Row():
            gr.Image(
                "asset/overview.png",
                show_label=False,
                show_download_button=False,
                scale=0.6,
            )
        gr.Markdown(
            """
### With Evalverse, you are empowered to
- access various evaluation methods without juggling multiple libraries.
- receive insightful report about the evaluation results that helps you to compare the varied scores across different models.
- initiate evaluation and generate reports without any code via Slack bot.
## π Architecture of Evalverse
"""
        )
        with gr.Row():
            gr.Image(
                "asset/architecture.png",
                show_label=False,
                show_download_button=False,
                scale=0.8,
            )
        gr.Markdown(
            """
- `Submodule`. The Submodule serves as the evaluation engine that is responsible for the heavy lifting involved in evaluating LLMs. Publicly available LLM evaluation libraries can be integrated into Evalverse as submodules. This component makes Evalverse expandable, thereby ensuring that the library remains up-to-date.
- `Connector`. The Connector plays a role in linking the Submodules with the Evaluator. It contains evaluation scripts, along with the necessary arguments, from various external libraries.
- `Evaluator`. The Evaluator performs the requested evaluations on the Compute Cluster by utilizing the evaluation scripts from the Connector. The Evaluator can receive evaluation requests either from the Reporter, which facilitates a no-code evaluation approach, or directly from the end-user for code-based evaluation.
- `Compute Cluster`. The Compute Cluster is the collection of hardware accelerators needed to execute the LLM evaluation processes. When the Evaluator schedules an evaluation job to be ran, the Compute Cluster fetches the required model and data files from the Database. The results of the evaluation jobs are sent to the Database for storage.
- `Database`. The Database stores the model files and data needed in the evaluation processes, along with evaluation results. The stored evaluation results are used by the Reporter to create evaluation reports for the user.
- `Reporter`. The Reporter handles the evaluation and report requests sent by the users, allowing for a no-code approach to LLM evaluation. The Reporter sends the requested evaluation jobs to the Evaluator and fetches the evaluation results from the Database, which are sent to the user via an external communication platform such as Slack. Through this, users can receive table and figure that summarize evaluation results.
## π Key Features of Evalverse
- **Unified evaluation with Submodules**: Evalverse extends its evaluation capabilities through Git submodules, effortlessly incorporating frameworks like [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) and [FastChat](https://github.com/lm-sys/FastChat). Swiftly add new tools and keep pace with the latest in LLM evaluation.
- **No-code evaluation request**: With Evalverse, request LLM evaluations without any code, simply by sending `Request!` in a direct message or Slack channel with an activate Evalverse Slack bot. Enter the model name in the Huggingface hub or local model directory path in Slack, and let the bot handle the rest.
- **LLM evaluation report**: Obtain comprehensive, no-code reports from Evalverse. Request with a simple command -`Report!`-, select the model and evaluation criteria, and receive detailed reports with scores, rankings, and visuals, all generated from the stored score database.
## π Supported Evaluations
We currently support four evaluation methods. If you have suggestions for new methods, we welcome your input!
| Evaluation | Original Repository |
|---------------------------|--------------------------------------------|
| H6 (Open LLM Leaderboard) | [EleutherAI](https://github.com/EleutherAI)/[lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)|
| MT-bench | [lm-sys](https://github.com/lm-sys)/[FastChat](https://github.com/lm-sys/FastChat)|
| IFEval | [google-research](https://github.com/google-research/google-research/tree/master)/[instruction_following_eval](https://github.com/google-research/google-research/tree/master/instruction_following_eval)|
| EQ-Bench | [EQ-bench](https://github.com/EQ-bench)/[EQ-Bench](https://github.com/EQ-bench/EQ-Bench)|
## π Acknowledgements
Evalverse is an open-source project orchestrated by the **Data-Centric LLM Team** at `Upstage`, designed as an ecosystem for LLM evaluation. Launched in April 2024, this initiative stands at the forefront of advancing evaluation handling in the realm of large language models (LLMs).
## π License
Evalverse is completely freely-accessible open-source and licensed under the Apache License 2.0.
## π Citation
If you want to cite our π Evalverse project, feel free to use the following bibtex. You can check our paper via [link](https://arxiv.org/abs/2404.00943).
```bibtex
@misc{kim2024evalverse,
title={Evalverse: Unified and Accessible Library for Large Language Model Evaluation},
author={Jihoo Kim and Wonho Song and Dahyun Kim and Yunsu Kim and Yungi Kim and Chanjun Park},
year={2024},
eprint={2404.00943},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
"""
        )

    # NOTE(review): assumed to sit below the tabs (at Blocks level) rather
    # than inside the last tab; the flattened source does not show which.
    with gr.Row():
        with gr.Accordion("The scores are collected from ...", open=False):
            gr.Markdown(
                """
- [HuggingFace Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
- [LMSYS Chatbot Arena Leaderboard](https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard)
- [EQ-Bench Leaderboard](https://eqbench.com/)
- [Arena-Hard Leaderboard](https://lmsys.org/blog/2024-04-19-arena-hard/#full-leaderboard-with-gpt-4-turbo-as-judge)
- [AlpacaEval Leaderboard](https://tatsu-lab.github.io/alpaca_eval/)
- [MMLU-Pro Leaderboard](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro#4-leaderboard)
- Results from [Evalverse](https://github.com/UpstageAI/evalverse)
"""
            )
# Start the Gradio server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()
|