{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73b1aa22-a1e3-4a1e-9dd2-042ab0f5939a",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Imports: stdlib first, then third-party\n",
    "import sys\n",
    "import json\n",
    "import os\n",
    "import subprocess\n",
    "from datetime import datetime\n",
    "from getpass import getpass\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from huggingface_hub import notebook_login, create_inference_endpoint, list_inference_endpoints, whoami, get_inference_endpoint, get_token\n",
    "from tqdm.notebook import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "772897cb-c2b1-4f9a-8143-ad64aed40b5b",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f951213-46a1-4db9-be2c-51c2291ecdc2",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "proj_dir = Path.cwd()\n",
    "print(proj_dir)\n",
    "LLMPerf_path = proj_dir/'llmperf'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "267ea96b-b756-4e16-b41a-fee2119edf76",
   "metadata": {
    "tags": []
   },
   "source": [
    "# Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d3341f2-217e-42a5-89fb-1653fd418c48",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Endpoint\n",
    "ENDPOINT_NAME=\"tgi-benchmark-sp\"\n",
    "NAMESPACE = 'hf-test-lab'\n",
    "MODEL = 'meta-llama/Meta-Llama-3-8B-Instruct'\n",
    "# NOTE(review): used to label the results directory; create_endpoint currently\n",
    "# hardcodes 'nvidia-a100' in the API call -- confirm the intended instance type.\n",
    "INSTANCE_TYPE = 'nvidia-a100_2'\n",
    "\n",
    "# Simulation\n",
    "RESULTS_DIR = proj_dir/'tgi_benchmark_results'/INSTANCE_TYPE\n",
    "# TGI server-side MAX_BATCH_SIZE values to sweep over\n",
    "tgi_bss = [8, 16, 24, 32, 40, 48, 56, 64]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f6bbb792-b168-42b8-bff1-c6ea9f6daf79",
   "metadata": {},
   "source": [
    "# Endpoint setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae923833-8ca1-4d16-85be-a78ffb386c43",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def create_endpoint(MAX_BATCH_SIZE, name, instance_type):\n",
    "    \"\"\"Return a running Inference Endpoint named `name`, creating it if needed.\n",
    "\n",
    "    Reuses an existing endpoint in NAMESPACE when one is found; otherwise\n",
    "    deploys MODEL on a TGI image configured with the given MAX_BATCH_SIZE.\n",
    "    Returns the endpoint object, or None if creation fails.\n",
    "    \"\"\"\n",
    "    # Reuse an already-provisioned endpoint if one exists under this name.\n",
    "    try:\n",
    "        endpoint = get_inference_endpoint(name=name, namespace=NAMESPACE)\n",
    "        endpoint.wait()\n",
    "        return endpoint\n",
    "    except Exception:\n",
    "        # Endpoint does not exist yet (or lookup failed) -- fall through to creation.\n",
    "        pass\n",
    "    try:\n",
    "        endpoint = create_inference_endpoint(\n",
    "            name,\n",
    "            repository=MODEL,\n",
    "            task=\"text-generation\",\n",
    "            framework=\"pytorch\",\n",
    "            region=\"us-east-1\",\n",
    "            vendor=\"aws\",\n",
    "            accelerator=\"gpu\",\n",
    "            instance_size=\"x1\",\n",
    "            # NOTE(review): the `instance_type` parameter of this function is\n",
    "            # currently unused; the value is hardcoded here. Confirm whether\n",
    "            # the parameter should be forwarded instead.\n",
    "            instance_type='nvidia-a100',\n",
    "            min_replica=0,\n",
    "            max_replica=1,\n",
    "            namespace=NAMESPACE,\n",
    "            custom_image={\n",
    "                \"health_route\": \"/health\",\n",
    "                \"env\": {\n",
    "                    \"MAX_INPUT_LENGTH\": \"3050\",\n",
    "                    \"MAX_TOTAL_TOKENS\": \"3300\",\n",
    "                    \"MAX_BATCH_SIZE\": f\"{MAX_BATCH_SIZE}\",\n",
    "                    \"HF_TOKEN\": get_token(),\n",
    "                    \"MODEL_ID\": \"/repository\",\n",
    "                },\n",
    "                \"url\": \"ghcr.io/huggingface/text-generation-inference:2.0.4\",\n",
    "            },\n",
    "            type=\"protected\",\n",
    "        )\n",
    "        endpoint.wait()\n",
    "    except Exception as create_error:\n",
    "        print(f\"Failed to create inference endpoint: {str(create_error)}\")\n",
    "        return None\n",
    "\n",
    "    return endpoint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "491b82b3-4db8-4409-85ce-7c003a6c2f6f",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def run_command(batch_size, endpoint, tgi_bs):\n",
    "    \"\"\"Run one LLMPerf benchmark against `endpoint`.\n",
    "\n",
    "    batch_size: number of concurrent client requests (virtual users).\n",
    "    tgi_bs: server-side TGI MAX_BATCH_SIZE the endpoint was deployed with\n",
    "        (only used to label the results directory).\n",
    "    Returns a tuple (output_text, success_flag).\n",
    "    \"\"\"\n",
    "    prefix = f'tgibs_{tgi_bs}__bs_{batch_size}'\n",
    "    vu = batch_size\n",
    "\n",
    "    # Set environment variables for the benchmark subprocess\n",
    "    env = os.environ.copy()\n",
    "    env['HUGGINGFACE_API_BASE'] = endpoint.url\n",
    "    env['HUGGINGFACE_API_KEY'] = get_token()\n",
    "    # Prepend LLMPerf to PYTHONPATH; only add a separator when there is an\n",
    "    # existing value, to avoid leaving a dangling path separator.\n",
    "    existing_pythonpath = env.get('PYTHONPATH', '')\n",
    "    env['PYTHONPATH'] = str(LLMPerf_path) + (os.pathsep + existing_pythonpath if existing_pythonpath else '')\n",
    "\n",
    "    # Define the benchmark script path\n",
    "    benchmark_script = str(LLMPerf_path / \"token_benchmark_ray.py\")\n",
    "\n",
    "    if not os.path.isfile(benchmark_script):\n",
    "        print(f\"LLMPerf script not found at {benchmark_script}, please ensure the path is correct.\")\n",
    "        return \"Script not found\", False\n",
    "\n",
    "    # Calculate the max number of completed requests (8 rounds per virtual user)\n",
    "    max_requests = vu * 8\n",
    "\n",
    "    # Generate a unique, timestamped results directory name\n",
    "    date_str = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')\n",
    "    results_dir = RESULTS_DIR / f\"{date_str}_{prefix}\"\n",
    "\n",
    "    # Construct the command to run the benchmark script\n",
    "    command = [\n",
    "        \"python\", benchmark_script,\n",
    "        \"--model\", f\"huggingface/{MODEL}\",\n",
    "        \"--mean-input-tokens\", \"3000\",\n",
    "        \"--stddev-input-tokens\", \"10\",\n",
    "        \"--mean-output-tokens\", \"240\",\n",
    "        \"--stddev-output-tokens\", \"5\",\n",
    "        \"--max-num-completed-requests\", str(min(max_requests, 1500)),\n",
    "        \"--timeout\", \"7200\",\n",
    "        \"--num-concurrent-requests\", str(vu),\n",
    "        \"--results-dir\", str(results_dir),\n",
    "        \"--llm-api\", \"litellm\",\n",
    "        \"--additional-sampling-params\", '{}'\n",
    "    ]\n",
    "\n",
    "    # Run the command with the modified environment\n",
    "    try:\n",
    "        result = subprocess.check_output(command, stderr=subprocess.STDOUT, env=env).decode('utf-8')\n",
    "        return result, True\n",
    "    except subprocess.CalledProcessError as e:\n",
    "        print(f\"Error with batch size {batch_size}: {e.output.decode()}\")\n",
    "        return e.output.decode(), False\n",
    "\n",
    "def find_max_working_batch_size(endpoint, tgi_bs):\n",
    "    \"\"\"Ramp client concurrency upward until a benchmark run fails.\n",
    "\n",
    "    Returns the largest batch size that succeeded, or an explanatory string\n",
    "    when even the smallest size fails.\n",
    "    \"\"\"\n",
    "    batch_sizes = [8, 16, 32, 64, 128, 256]\n",
    "    max_working = None\n",
    "    for size in tqdm(batch_sizes):\n",
    "        tqdm.write(f\"Running: TGIBS {tgi_bs} Client Requests {size}\")\n",
    "        output, success = run_command(size, endpoint, tgi_bs)\n",
    "        if success:\n",
    "            max_working = size\n",
    "        else:\n",
    "            break\n",
    "    if max_working is None:\n",
    "        return \"No working batch size found in the provided list\"\n",
    "    return max_working"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70a11c08-0bea-43d6-85eb-ef014473c9f1",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Sweep: for each TGI MAX_BATCH_SIZE, deploy an endpoint, find the max\n",
    "# client concurrency it sustains, then tear the endpoint down.\n",
    "for tgi_bs in tqdm(tgi_bss):\n",
    "    name = f\"{ENDPOINT_NAME}--tgibs-{tgi_bs}\"\n",
    "    endpoint = create_endpoint(MAX_BATCH_SIZE=tgi_bs, name=name, instance_type=INSTANCE_TYPE)\n",
    "    # create_endpoint returns None on failure and already waits on success,\n",
    "    # so guard instead of calling endpoint.wait() here (would raise AttributeError).\n",
    "    if endpoint is None:\n",
    "        tqdm.write(f\"Endpoint creation failed, skipping: {name}\")\n",
    "        continue\n",
    "    tqdm.write(f\"Endpoint Created: {name}\")\n",
    "    max_batch_size = find_max_working_batch_size(endpoint=endpoint, tgi_bs=tgi_bs)\n",
    "    endpoint.delete()\n",
    "    tqdm.write(f\"Endpoint Deleted: {name}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25ef390c-10fe-4466-b8fd-1c01730205d2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}