data_only_llm_perf_leaderboard

Runtime error

App Files Files Community

data_only_llm_perf_leaderboard / src /llm_perf.py

IlyasMoutawwakil HF staff

Update src/llm_perf.py

eeaa368 verified 11 months ago

raw

history blame

3.98 kB

	import os

	import pandas as pd
	from huggingface_hub import hf_hub_download

	from .utils import process_quantization_scheme, process_arch

	# Hugging Face dataset repository that hosts the CSV files read by this module.
	LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
	# Optional access token taken from the environment; None when HF_TOKEN is unset.
	HF_TOKEN = os.getenv("HF_TOKEN")

	# Raw column name -> displayed column name, split by theme. The groups are
	# merged below in this exact order, which is also the final column order.
	_MODEL_COLUMNS = {
	    "Model": "Model 🤗",
	    "Arch": "Arch 🏛️",
	    "Size": "Params (B)",
	    "Score": "Open LLM Score (%)",
	}
	# deployment settings
	_DEPLOYMENT_COLUMNS = {
	    "backend.name": "Backend 🏭",
	    "backend.torch_dtype": "DType 📥",
	    "optimization": "Optimization 🛠️",
	    "quantization": "Quantization 🗜️",
	}
	# primary measurements
	_PRIMARY_MEASUREMENT_COLUMNS = {
	    "forward.latency(s)": "Prefill Latency (s)",
	    "decode.throughput(tokens/s)": "Decode Throughput (tokens/s)",
	    "generate.max_memory_allocated(MB)": "Allocated Memory (MB)",
	    "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh)",
	}
	# additional measurements
	_ADDITIONAL_MEASUREMENT_COLUMNS = {
	    "generate.latency(s)": "E2E Latency (s)",
	    "generate.throughput(tokens/s)": "E2E Throughput (tokens/s)",
	    "generate.max_memory_reserved(MB)": "Reserved Memory (MB)",
	    "generate.max_memory_used(MB)": "Used Memory (MB)",
	}
	COLUMNS_MAPPING = {
	    **_MODEL_COLUMNS,
	    **_DEPLOYMENT_COLUMNS,
	    **_PRIMARY_MEASUREMENT_COLUMNS,
	    **_ADDITIONAL_MEASUREMENT_COLUMNS,
	}

	# (displayed column, ascending?) pairs in sort-priority order. Both public
	# lists are derived from this single spec so they cannot get out of step.
	_SORTING_SPEC = (
	    ("Open LLM Score (%)", False),
	    ("Prefill Latency (s)", True),
	    ("Decode Throughput (tokens/s)", False),
	)
	SORTING_COLUMNS = [column for column, _ in _SORTING_SPEC]
	SORTING_ASCENDING = [ascending for _, ascending in _SORTING_SPEC]


	def get_llm_df():
	    """Download ``open-llm.csv`` from the dataset repo and load it.

	    The file is fetched from ``LLM_PERF_DATASET_REPO`` into the local
	    ``dataset`` directory, passing ``HF_TOKEN`` as the access token, and is
	    then parsed with pandas.

	    Returns:
	        pandas.DataFrame: The parsed contents of ``open-llm.csv``.
	    """
	    # Read from the path reported by the download rather than spelling the
	    # location a second time, so the two can never drift apart.
	    csv_path = hf_hub_download(
	        repo_id=LLM_PERF_DATASET_REPO,
	        filename="open-llm.csv",
	        local_dir="dataset",
	        repo_type="dataset",
	        token=HF_TOKEN,
	    )
	    return pd.read_csv(csv_path)


	def get_perf_df(machine: str = "hf-dgx-01"):
	    """Download and load the performance report of one benchmark machine.

	    The file ``{machine}/perf-report.csv`` is fetched from
	    ``LLM_PERF_DATASET_REPO`` into the local ``dataset`` directory, passing
	    ``HF_TOKEN`` as the access token, and is then parsed with pandas.

	    Args:
	        machine: Name of the directory in the dataset repo that holds the
	            report to load.

	    Returns:
	        pandas.DataFrame: The parsed contents of the machine's
	        ``perf-report.csv``.
	    """
	    # Read from the path reported by the download so the machine-specific
	    # location is built in exactly one place.
	    report_path = hf_hub_download(
	        repo_id=LLM_PERF_DATASET_REPO,
	        filename=f"{machine}/perf-report.csv",
	        local_dir="dataset",
	        repo_type="dataset",
	        token=HF_TOKEN,
	    )
	    return pd.read_csv(report_path)


	def get_llm_perf_df(machine: str = "hf-dgx-01"):
	    """Build the sorted, display-ready table of model and performance data.

	    The model table (``get_llm_df``) is inner-joined with the machine's
	    performance report (``get_perf_df``) on the model name. Derived
	    ``optimization``, ``quantization`` and tokens/kWh energy columns are
	    added, ``Arch`` is normalised with ``process_arch``, and the result is
	    reduced to the columns of ``COLUMNS_MAPPING``, renamed to their display
	    names and sorted by ``SORTING_COLUMNS`` / ``SORTING_ASCENDING``.

	    Args:
	        machine: Benchmark machine whose report is merged in; forwarded to
	            ``get_perf_df``.

	    Returns:
	        pandas.DataFrame: One row per merged (model, benchmark) entry, with
	        the display column names from ``COLUMNS_MAPPING``.

	    Raises:
	        AssertionError: If the merged rows do not all share one batch size,
	            one sequence length and one number of new tokens.
	    """
	    # get dataframes
	    llm_df = get_llm_df()
	    perf_df = get_perf_df(machine=machine)
	    merged = pd.merge(llm_df, perf_df, left_on="Model", right_on="model")

	    # Rows are only comparable when they come from one benchmark setup.
	    # Raised explicitly (not via `assert`) so the check survives `python -O`;
	    # the exception type is kept for backward compatibility.
	    for setting in (
	        "benchmark.input_shapes.batch_size",
	        "benchmark.input_shapes.sequence_length",
	        "benchmark.new_tokens",
	    ):
	        distinct = merged[setting].nunique()
	        if distinct != 1:
	            raise AssertionError(
	                f"expected exactly one distinct value in {setting!r}, found {distinct}"
	            )

	    # Invert energy from kWh/token to whole tokens/kWh. Rows without a usable
	    # (strictly positive) measurement are tracked with an explicit mask, so a
	    # real result that truncates to 1 is not mistaken for "missing" and a
	    # zero reading cannot produce an infinity that fails the integer cast.
	    kwh_per_token = merged["generate.energy_consumption(kWh/token)"]
	    measured = kwh_per_token > 0
	    tokens_per_kwh = (1 / kwh_per_token.where(measured, 1)).astype(int)
	    merged["generate.energy_consumption(tokens/kWh)"] = tokens_per_kwh.where(measured)

	    def _optimization_label(row):
	        # BetterTransformer takes precedence over FlashAttention 2.
	        if row["backend.to_bettertransformer"]:
	            return "BetterTransformer"
	        if row["backend.use_flash_attention_2"]:
	            return "FlashAttentionV2"
	        return "None"

	    # add optimization column
	    merged["optimization"] = merged[
	        ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
	    ].apply(_optimization_label, axis=1)
	    # add quantization scheme
	    merged["quantization"] = merged[
	        [
	            "backend.quantization_scheme",
	            "backend.quantization_config.bits",
	            "backend.quantization_config.version",
	            "backend.quantization_config.load_in_4bit",
	            "backend.quantization_config.load_in_8bit",
	            "backend.quantization_config.exllama_config.version",
	        ]
	    ].apply(process_quantization_scheme, axis=1)
	    # add arch
	    merged["Arch"] = merged["Arch"].apply(process_arch)

	    # Select, rename and sort without `inplace=True`: each step returns a
	    # new frame, so nothing is mutated on a column subset of `merged`.
	    return (
	        merged[list(COLUMNS_MAPPING)]
	        .rename(columns=COLUMNS_MAPPING)
	        .sort_values(by=SORTING_COLUMNS, ascending=SORTING_ASCENDING)
	    )