File size: 2,887 Bytes
1d4a48e
6f3a090
3c37eb3
c8763bd
a8a6326
 
 
51a4daf
 
a8a6326
 
51a4daf
a8a6326
 
51a4daf
a8a6326
76b423c
51a4daf
 
a8a6326
 
 
 
 
 
76b423c
a8a6326
 
 
 
 
 
483e3a1
 
 
bee5389
ab5f5f1
2ff4a74
3c37eb3
9dc4521
 
bee5389
f45c3f0
9dc4521
2ff4a74
00642fb
ad5bd56
9dc4521
bee5389
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# HTML <img> tag pointing at the logo file in the optimum-benchmark GitHub repo.
LOGO = (
    '<img src="https://raw.githubusercontent.com/'
    'huggingface/optimum-benchmark/main/logo.png">'
)

# Centered HTML heading (element id "space-title") carrying the leaderboard name.
# The emoji are stored as real Unicode characters; the previous text was
# mis-decoded UTF-8 (mojibake) and rendered as garbage.
TITLE = """<h1 align="center" id="space-title">🤗 LLM-Perf Leaderboard 🏋️</h1>"""

# Markdown text with two sections: "About" (what the leaderboard benchmarks and
# where to file model / configuration requests) and "Details" (how the numbers
# are measured). Emoji are stored as real Unicode characters rather than
# mis-decoded UTF-8, and the project name is spelled "Optimum-Benchmark".
ABOUT = """
## 📝 About
The 🤗 LLM-Perf Leaderboard 🏋️ is a leaderboard at the intersection of quality and performance.
Its aim is to benchmark the performance (latency, throughput, memory & energy)
of Large Language Models (LLMs) with different hardwares, backends and optimizations
using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark).

Anyone from the community can request a new base model or hardware/backend/optimization
configuration for automated benchmarking:

- Model evaluation requests should be made in the
[🤗 Open LLM Leaderboard 🏅](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard);
we scrape the [list of canonical base models](https://github.com/huggingface/optimum-benchmark/blob/main/llm_perf/utils.py) from there.
- Hardware/Backend/Optimization configuration requests should be made in the
[🤗 LLM-Perf Leaderboard 🏋️](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) or
[Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) repository (where the code is hosted).

## ✍️ Details

- To avoid communication-dependent results, only one GPU is used.
- Score is the average evaluation score obtained from the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard).
- LLMs are running on a singleton batch with a prompt size of 256 and generating 64 tokens for at least 10 iterations and 10 seconds.
- Energy consumption is measured in kWh using CodeCarbon and taking into consideration the GPU, CPU, RAM and location of the machine.
- We measure three types of memory: Max Allocated Memory, Max Reserved Memory and Max Used Memory. The first two are reported by PyTorch and the last one is observed using PyNVML.

All of our benchmarks are run by this single script
[benchmark_cuda_pytorch.py](https://github.com/huggingface/optimum-benchmark/blob/llm-perf/llm-perf/benchmark_cuda_pytorch.py)
using the power of [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) to guarantee reproducibility and consistency.
"""


# Caption text for the citation snippet.
CITATION_BUTTON_LABEL = (
    "Copy the following snippet to cite these results."
)
# BibTeX entries for the leaderboard (@misc) and the benchmarking tool
# (@software). Kept as a raw string so "\url" is emitted verbatim.
# Authors are joined with " and ": BibTeX reads a comma inside an author field
# as "Last, First", so a comma-separated list would parse as one person.
CITATION_BUTTON = r"""@misc{llm-perf-leaderboard,
  author = {Ilyas Moutawwakil and Régis Pierrard},
  title = {LLM-Perf Leaderboard},
  year = {2023},
  publisher = {Hugging Face},
  howpublished = "\url{https://huggingface.co/spaces/optimum/llm-perf-leaderboard}",
}
@software{optimum-benchmark,
  author = {Ilyas Moutawwakil and Régis Pierrard},
  publisher = {Hugging Face},
  title = {Optimum-Benchmark: A framework for benchmarking the performance of Transformers models with different hardwares, backends and optimizations.},
}
"""