import gradio as gr
import pandas as pd
import os
import shutil

DESCRIPTION = """
Independent performance benchmark of LLMs with various inference engines. Definitions are below the table.
"""

INTRODUCTION = """
**Introduction**

In our ongoing quest to help developers find the right libraries and LLMs for their use cases, we benchmarked five popular open-source models.

We tested them across six different inference engines (vLLM, TGI, TensorRT-LLM, Triton with vLLM Backend, DeepSpeed-MII, CTranslate2) on A100 GPUs hosted on Azure, ensuring a neutral playing field separate from our Inferless platform.

The goal? To help developers, researchers, and AI enthusiasts pinpoint the best LLMs for their needs, whether for development or production.
"""

HOW_WE_TESTED = """
**How we tested**

Here's how we ensured consistent, reliable benchmarks:

* **Platform:** All tests ran on A100 GPUs from Azure, providing a level playing field.
* **Setup:** Docker containers for each library ensured a consistent environment.
* **Configuration:** Standard settings (temperature 0.5, top_p 1) kept the focus on performance, not external variables.
* **Prompts & Token Ranges:** We used six distinct prompts with input lengths from 20 to 2,000 tokens and tested generation lengths of 100, 200, and 500 tokens to evaluate each library's flexibility.
* **Models & Libraries Tested:** We evaluated Phi-3-medium-128k-instruct, Meta-Llama-3.1-8B-Instruct, Mistral-7B-Instruct-v0.3, Qwen2-7B-Instruct, and Gemma-2-9b-it using Text Generation Inference (TGI), vLLM, DeepSpeed-MII, CTranslate2, Triton with vLLM Backend, and TensorRT-LLM.
"""

csv_folder_path = 'result_csv/'

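# Each result CSV is expected to provide (a subset of) the leaderboard columns;
# any column missing from a file is filled with NA below:
#   Model_Name, Library, TTFT, Tokens-per-Second, Token_Count,
#   Input_Tokens, Output_Tokens, Input, Output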
def read_and_process_csv_files(folder_path):
    """Read every CSV in the results folder and combine them into a single DataFrame."""
    all_data = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            all_data.append(df)

    combined_df = pd.concat(all_data, ignore_index=True)

    # Enforce a fixed column order; columns absent from the data are filled with NA.
    columns_order = [
        "Model_Name", "Library", "TTFT", "Tokens-per-Second", "Token_Count",
        "Input_Tokens", "Output_Tokens", "Input", "Output"
    ]
    for col in columns_order:
        if col not in combined_df.columns:
            combined_df[col] = pd.NA

    return combined_df[columns_order]

df = read_and_process_csv_files(csv_folder_path)


def get_leaderboard_df():
    return df

def add_new_entry(file):
    """Append an uploaded CSV to the leaderboard and save a copy into the results folder."""
    global df
    if file is None:
        return df, "No file uploaded."

    new_df = pd.read_csv(file.name)

    # Align the uploaded data with the leaderboard schema.
    columns_order = [
        "Model_Name", "Library", "TTFT", "Tokens-per-Second", "Token_Count",
        "Input_Tokens", "Output_Tokens", "Input", "Output"
    ]
    for col in columns_order:
        if col not in new_df.columns:
            new_df[col] = pd.NA
    new_df = new_df[columns_order]

    df = pd.concat([df, new_df], ignore_index=True)

    # Persist the uploaded file so it is picked up again on the next app start.
    filename = os.path.basename(file.name)
    destination = os.path.join(csv_folder_path, filename)
    shutil.copy(file.name, destination)

    return df, f"File '{filename}' uploaded and data added successfully!"

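# Gradio UI layout: description and introduction at the top, the leaderboard and
# upload tabs in the middle, and the testing methodology at the bottom.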
with gr.Blocks() as demo:
    gr.Markdown("# LLM Inference Leaderboard")

    with gr.Column():
        gr.Markdown("---")
        gr.Markdown(DESCRIPTION)
        gr.Markdown(INTRODUCTION)
        gr.Markdown("---")

    with gr.Tabs():
        with gr.TabItem("Leaderboard"):
            leaderboard = gr.DataFrame(df)

        with gr.TabItem("Add New Entry"):
            file_upload = gr.File(label="Upload CSV File")
            submit_button = gr.Button("Add Entry")
            result = gr.Markdown()

    with gr.Column():
        gr.Markdown("---")
        gr.Markdown(HOW_WE_TESTED)

    # Adding an entry refreshes the leaderboard table and shows a status message.
    submit_button.click(
        add_new_entry,
        inputs=[file_upload],
        outputs=[leaderboard, result]
    )

    # Populate the leaderboard from the in-memory DataFrame when the page loads.
    demo.load(get_leaderboard_df, outputs=[leaderboard])

demo.launch()