{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "97626fd6-6fcc-4d8f-a5af-40441e46f98b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting vllm\n", " Downloading vllm-0.6.4.post1-cp38-abi3-manylinux1_x86_64.whl.metadata (10 kB)\n", "Requirement already satisfied: autoawq in /usr/local/lib/python3.11/dist-packages (0.2.7.post2)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.11/dist-packages (from vllm) (6.0.0)\n", "Collecting sentencepiece (from vllm)\n", " Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)\n", "Requirement already satisfied: numpy<2.0.0 in /usr/local/lib/python3.11/dist-packages (from vllm) (1.26.3)\n", "Requirement already satisfied: requests>=2.26.0 in /usr/local/lib/python3.11/dist-packages (from vllm) (2.32.3)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from vllm) (4.67.1)\n", "Collecting py-cpuinfo (from vllm)\n", " Downloading py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)\n", "Requirement already satisfied: transformers>=4.45.2 in /usr/local/lib/python3.11/dist-packages (from vllm) (4.47.0.dev0)\n", "Requirement already satisfied: tokenizers>=0.19.1 in /usr/local/lib/python3.11/dist-packages (from vllm) (0.20.3)\n", "Collecting protobuf (from vllm)\n", " Downloading protobuf-5.29.1-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.11/dist-packages (from vllm) (3.11.8)\n", "Collecting openai>=1.45.0 (from vllm)\n", " Downloading openai-1.57.0-py3-none-any.whl.metadata (24 kB)\n", "Collecting uvicorn[standard] (from vllm)\n", " Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)\n", "Collecting pydantic>=2.9 (from vllm)\n", " Downloading pydantic-2.10.3-py3-none-any.whl.metadata (172 kB)\n", "Requirement already satisfied: pillow in /usr/local/lib/python3.11/dist-packages 
(from vllm) (10.2.0)\n", "Requirement already satisfied: prometheus-client>=0.18.0 in /usr/local/lib/python3.11/dist-packages (from vllm) (0.21.0)\n", "Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)\n", " Downloading prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl.metadata (13 kB)\n", "Collecting tiktoken>=0.6.0 (from vllm)\n", " Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", "Collecting lm-format-enforcer<0.11,>=0.10.9 (from vllm)\n", " Downloading lm_format_enforcer-0.10.9-py3-none-any.whl.metadata (17 kB)\n", "Collecting outlines<0.1,>=0.0.43 (from vllm)\n", " Downloading outlines-0.0.46-py3-none-any.whl.metadata (15 kB)\n", "Collecting typing-extensions>=4.10 (from vllm)\n", " Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)\n", "Requirement already satisfied: filelock>=3.10.4 in /usr/local/lib/python3.11/dist-packages (from vllm) (3.13.1)\n", "Collecting partial-json-parser (from vllm)\n", " Downloading partial_json_parser-0.2.1.1.post4-py3-none-any.whl.metadata (6.2 kB)\n", "Requirement already satisfied: pyzmq in /usr/local/lib/python3.11/dist-packages (from vllm) (24.0.1)\n", "Collecting msgspec (from vllm)\n", " Downloading msgspec-0.18.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)\n", "Collecting gguf==0.10.0 (from vllm)\n", " Downloading gguf-0.10.0-py3-none-any.whl.metadata (3.5 kB)\n", "Requirement already satisfied: importlib-metadata in /usr/lib/python3/dist-packages (from vllm) (4.6.4)\n", "Collecting mistral-common>=1.5.0 (from mistral-common[opencv]>=1.5.0->vllm)\n", " Downloading mistral_common-1.5.1-py3-none-any.whl.metadata (4.6 kB)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.11/dist-packages (from vllm) (6.0.2)\n", "Collecting einops (from vllm)\n", " Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)\n", "Collecting compressed-tensors==0.8.0 (from vllm)\n", " 
Downloading compressed_tensors-0.8.0-py3-none-any.whl.metadata (6.8 kB)\n", "Collecting ray>=2.9 (from vllm)\n", " Downloading ray-2.40.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (17 kB)\n", "Collecting nvidia-ml-py>=12.560.30 (from vllm)\n", " Downloading nvidia_ml_py-12.560.30-py3-none-any.whl.metadata (8.6 kB)\n", "Collecting torch==2.5.1 (from vllm)\n", " Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)\n", "Collecting torchvision==0.20.1 (from vllm)\n", " Downloading torchvision-0.20.1-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)\n", "Collecting xformers==0.0.28.post3 (from vllm)\n", " Downloading xformers-0.0.28.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)\n", "Collecting fastapi!=0.113.*,!=0.114.0,>=0.107.0 (from vllm)\n", " Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.11/dist-packages (from torch==2.5.1->vllm) (3.2.1)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.11/dist-packages (from torch==2.5.1->vllm) (3.1.3)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.11/dist-packages (from torch==2.5.1->vllm) (2024.2.0)\n", "Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->vllm)\n", " Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1->vllm)\n", " Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.1->vllm)\n", " Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /usr/local/lib/python3.11/dist-packages (from torch==2.5.1->vllm) (9.1.0.70)\n", "Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.5.1->vllm)\n", " Downloading 
nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.5.1->vllm)\n", " Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-curand-cu12==10.3.5.147 (from torch==2.5.1->vllm)\n", " Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch==2.5.1->vllm)\n", " Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch==2.5.1->vllm)\n", " Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)\n", "Collecting nvidia-nccl-cu12==2.21.5 (from torch==2.5.1->vllm)\n", " Downloading nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)\n", "Collecting nvidia-nvtx-cu12==12.4.127 (from torch==2.5.1->vllm)\n", " Downloading nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.7 kB)\n", "Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch==2.5.1->vllm)\n", " Downloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)\n", "Collecting triton==3.1.0 (from torch==2.5.1->vllm)\n", " Downloading triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)\n", "Collecting sympy==1.13.1 (from torch==2.5.1->vllm)\n", " Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)\n", "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.11/dist-packages (from sympy==1.13.1->torch==2.5.1->vllm) (1.3.0)\n", "Requirement already satisfied: accelerate in /usr/local/lib/python3.11/dist-packages (from autoawq) (1.1.1)\n", "Requirement already satisfied: datasets>=2.20 in /usr/local/lib/python3.11/dist-packages (from autoawq) (3.1.0)\n", "Requirement already satisfied: zstandard in 
/usr/local/lib/python3.11/dist-packages (from autoawq) (0.23.0)\n", "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (18.1.0)\n", "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (0.3.8)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (2.2.3)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (3.5.0)\n", "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (0.70.16)\n", "Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (0.26.3)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.11/dist-packages (from datasets>=2.20->autoawq) (24.1)\n", "Collecting starlette<0.42.0,>=0.40.0 (from fastapi!=0.113.*,!=0.114.0,>=0.107.0->vllm)\n", " Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)\n", "Collecting interegular>=0.3.2 (from lm-format-enforcer<0.11,>=0.10.9->vllm)\n", " Downloading interegular-0.3.3-py37-none-any.whl.metadata (3.0 kB)\n", "Requirement already satisfied: jsonschema<5.0.0,>=4.21.1 in /usr/local/lib/python3.11/dist-packages (from mistral-common>=1.5.0->mistral-common[opencv]>=1.5.0->vllm) (4.23.0)\n", "Collecting pillow (from vllm)\n", " Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.2 kB)\n", "Collecting tiktoken>=0.6.0 (from vllm)\n", " Downloading tiktoken-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", "Collecting opencv-python-headless<5.0.0,>=4.0.0 (from mistral-common[opencv]>=1.5.0->vllm)\n", " Downloading opencv_python_headless-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)\n", 
"Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.11/dist-packages (from openai>=1.45.0->vllm) (4.6.0)\n", "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai>=1.45.0->vllm) (1.7.0)\n", "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from openai>=1.45.0->vllm) (0.27.2)\n", "Collecting jiter<1,>=0.4.0 (from openai>=1.45.0->vllm)\n", " Downloading jiter-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)\n", "Requirement already satisfied: sniffio in /usr/local/lib/python3.11/dist-packages (from openai>=1.45.0->vllm) (1.3.1)\n", "Collecting lark (from outlines<0.1,>=0.0.43->vllm)\n", " Downloading lark-1.2.2-py3-none-any.whl.metadata (1.8 kB)\n", "Requirement already satisfied: nest-asyncio in /usr/local/lib/python3.11/dist-packages (from outlines<0.1,>=0.0.43->vllm) (1.6.0)\n", "Collecting cloudpickle (from outlines<0.1,>=0.0.43->vllm)\n", " Downloading cloudpickle-3.1.0-py3-none-any.whl.metadata (7.0 kB)\n", "Collecting diskcache (from outlines<0.1,>=0.0.43->vllm)\n", " Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)\n", "Collecting numba (from outlines<0.1,>=0.0.43->vllm)\n", " Downloading numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)\n", "Requirement already satisfied: referencing in /usr/local/lib/python3.11/dist-packages (from outlines<0.1,>=0.0.43->vllm) (0.35.1)\n", "Collecting pycountry (from outlines<0.1,>=0.0.43->vllm)\n", " Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)\n", "Collecting pyairports (from outlines<0.1,>=0.0.43->vllm)\n", " Downloading pyairports-2.1.1-py3-none-any.whl.metadata (1.7 kB)\n", "Collecting annotated-types>=0.6.0 (from pydantic>=2.9->vllm)\n", " Downloading annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)\n", "Collecting pydantic-core==2.27.1 (from pydantic>=2.9->vllm)\n", " Downloading 
pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", "Collecting click>=7.0 (from ray>=2.9->vllm)\n", " Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)\n", "Collecting msgpack<2.0.0,>=1.0.0 (from ray>=2.9->vllm)\n", " Downloading msgpack-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)\n", "Requirement already satisfied: aiosignal in /usr/local/lib/python3.11/dist-packages (from ray>=2.9->vllm) (1.3.1)\n", "Requirement already satisfied: frozenlist in /usr/local/lib/python3.11/dist-packages (from ray>=2.9->vllm) (1.5.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests>=2.26.0->vllm) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests>=2.26.0->vllm) (3.10)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests>=2.26.0->vllm) (2.2.3)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests>=2.26.0->vllm) (2024.8.30)\n", "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.11/dist-packages (from tiktoken>=0.6.0->vllm) (2024.11.6)\n", "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.11/dist-packages (from transformers>=4.45.2->vllm) (0.4.5)\n", "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (2.4.3)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (24.2.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (6.1.0)\n", "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (0.2.0)\n", "Requirement already satisfied: 
yarl<2.0,>=1.17.0 in /usr/local/lib/python3.11/dist-packages (from aiohttp->vllm) (1.18.0)\n", "Requirement already satisfied: h11>=0.8 in /usr/local/lib/python3.11/dist-packages (from uvicorn[standard]->vllm) (0.14.0)\n", "Collecting httptools>=0.6.3 (from uvicorn[standard]->vllm)\n", " Downloading httptools-0.6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)\n", "Collecting python-dotenv>=0.13 (from uvicorn[standard]->vllm)\n", " Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)\n", "Collecting uvloop!=0.15.0,!=0.15.1,>=0.14.0 (from uvicorn[standard]->vllm)\n", " Downloading uvloop-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n", "Collecting watchfiles>=0.13 (from uvicorn[standard]->vllm)\n", " Downloading watchfiles-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)\n", "Collecting websockets>=10.4 (from uvicorn[standard]->vllm)\n", " Downloading websockets-14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n", "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.11/dist-packages (from httpx<1,>=0.23.0->openai>=1.45.0->vllm) (1.0.5)\n", "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /usr/local/lib/python3.11/dist-packages (from jsonschema<5.0.0,>=4.21.1->mistral-common>=1.5.0->mistral-common[opencv]>=1.5.0->vllm) (2023.12.1)\n", "Requirement already satisfied: rpds-py>=0.7.1 in /usr/local/lib/python3.11/dist-packages (from jsonschema<5.0.0,>=4.21.1->mistral-common>=1.5.0->mistral-common[opencv]>=1.5.0->vllm) (0.20.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.11/dist-packages (from jinja2->torch==2.5.1->vllm) (2.1.5)\n", "Collecting llvmlite<0.44,>=0.43.0dev0 (from numba->outlines<0.1,>=0.0.43->vllm)\n", " Downloading 
llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets>=2.20->autoawq) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets>=2.20->autoawq) (2024.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas->datasets>=2.20->autoawq) (2024.2)\n", "Requirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas->datasets>=2.20->autoawq) (1.16.0)\n", "Downloading vllm-0.6.4.post1-cp38-abi3-manylinux1_x86_64.whl (198.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m198.9/198.9 MB\u001b[0m \u001b[31m104.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading compressed_tensors-0.8.0-py3-none-any.whl (86 kB)\n", "Downloading gguf-0.10.0-py3-none-any.whl (71 kB)\n", "Downloading torch-2.5.1-cp311-cp311-manylinux1_x86_64.whl (906.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m906.5/906.5 MB\u001b[0m \u001b[31m107.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading torchvision-0.20.1-cp311-cp311-manylinux1_x86_64.whl (7.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.2/7.2 MB\u001b[0m \u001b[31m164.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading xformers-0.0.28.post3-cp311-cp311-manylinux_2_28_x86_64.whl (16.7 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.7/16.7 MB\u001b[0m \u001b[31m150.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m138.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m132.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m138.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m117.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl (211.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m109.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl (56.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m135.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m148.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl (207.5 
MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m154.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl (188.7 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m188.7/188.7 MB\u001b[0m \u001b[31m150.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m134.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (99 kB)\n", "Downloading sympy-1.13.1-py3-none-any.whl (6.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m165.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading triton-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.5/209.5 MB\u001b[0m \u001b[31m142.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading fastapi-0.115.6-py3-none-any.whl (94 kB)\n", "Downloading lm_format_enforcer-0.10.9-py3-none-any.whl (43 kB)\n", "Downloading mistral_common-1.5.1-py3-none-any.whl (6.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.5/6.5 MB\u001b[0m \u001b[31m145.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m180.2 
MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading nvidia_ml_py-12.560.30-py3-none-any.whl (40 kB)\n", "Downloading openai-1.57.0-py3-none-any.whl (389 kB)\n", "Downloading outlines-0.0.46-py3-none-any.whl (101 kB)\n", "Downloading pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl (4.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m152.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl (19 kB)\n", "Downloading pydantic-2.10.3-py3-none-any.whl (456 kB)\n", "Downloading pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m212.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading ray-2.40.0-cp311-cp311-manylinux2014_x86_64.whl (67.0 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.0/67.0 MB\u001b[0m \u001b[31m172.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hDownloading protobuf-5.29.1-cp38-abi3-manylinux2014_x86_64.whl (319 kB)\n", "Downloading tiktoken-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m153.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading typing_extensions-4.12.2-py3-none-any.whl (37 kB)\n", "Downloading einops-0.8.0-py3-none-any.whl (43 kB)\n", "Downloading msgspec-0.18.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209 kB)\n", "Downloading partial_json_parser-0.2.1.1.post4-py3-none-any.whl (9.9 kB)\n", "Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)\n", "Downloading annotated_types-0.7.0-py3-none-any.whl (13 kB)\n", "Downloading 
click-8.1.7-py3-none-any.whl (97 kB)\n", "Downloading httptools-0.6.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (459 kB)\n", "Downloading interegular-0.3.3-py37-none-any.whl (23 kB)\n", "Downloading jiter-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (343 kB)\n", "Downloading msgpack-1.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (403 kB)\n", "Downloading opencv_python_headless-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.9/49.9 MB\u001b[0m \u001b[31m155.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hDownloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)\n", "Downloading starlette-0.41.3-py3-none-any.whl (73 kB)\n", "Downloading uvloop-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.0 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.0/4.0 MB\u001b[0m \u001b[31m157.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading watchfiles-1.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (442 kB)\n", "Downloading websockets-14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (168 kB)\n", "Downloading cloudpickle-3.1.0-py3-none-any.whl (22 kB)\n", "Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)\n", "Downloading lark-1.2.2-py3-none-any.whl (111 kB)\n", "Downloading numba-0.60.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.7 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.7/3.7 MB\u001b[0m \u001b[31m138.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pyairports-2.1.1-py3-none-any.whl (371 kB)\n", "Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)\n", "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.3/6.3 MB\u001b[0m \u001b[31m137.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading uvicorn-0.32.1-py3-none-any.whl (63 kB)\n", "Downloading llvmlite-0.43.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.9/43.9 MB\u001b[0m \u001b[31m169.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", "\u001b[?25hInstalling collected packages: sentencepiece, pyairports, py-cpuinfo, nvidia-ml-py, websockets, uvloop, typing-extensions, triton, sympy, python-dotenv, pycountry, protobuf, pillow, partial-json-parser, opencv-python-headless, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, msgspec, msgpack, llvmlite, lark, jiter, interegular, httptools, gguf, einops, diskcache, cloudpickle, click, annotated-types, watchfiles, uvicorn, tiktoken, starlette, pydantic-core, nvidia-cusparse-cu12, numba, pydantic, prometheus-fastapi-instrumentator, nvidia-cusolver-cu12, torch, ray, openai, mistral-common, lm-format-enforcer, fastapi, xformers, torchvision, outlines, compressed-tensors, vllm\n", " Attempting uninstall: typing-extensions\n", " Found existing installation: typing_extensions 4.9.0\n", " Uninstalling typing_extensions-4.9.0:\n", " Successfully uninstalled typing_extensions-4.9.0\n", " Attempting uninstall: triton\n", " Found existing installation: triton 3.0.0\n", " Uninstalling triton-3.0.0:\n", " Successfully uninstalled triton-3.0.0\n", " Attempting uninstall: sympy\n", " Found existing installation: sympy 1.12\n", " Uninstalling sympy-1.12:\n", " Successfully uninstalled sympy-1.12\n", " Attempting uninstall: pillow\n", " Found existing installation: pillow 10.2.0\n", " Uninstalling pillow-10.2.0:\n", " Successfully 
uninstalled pillow-10.2.0\n", " Attempting uninstall: nvidia-nvtx-cu12\n", " Found existing installation: nvidia-nvtx-cu12 12.4.99\n", " Uninstalling nvidia-nvtx-cu12-12.4.99:\n", " Successfully uninstalled nvidia-nvtx-cu12-12.4.99\n", " Attempting uninstall: nvidia-nvjitlink-cu12\n", " Found existing installation: nvidia-nvjitlink-cu12 12.4.99\n", " Uninstalling nvidia-nvjitlink-cu12-12.4.99:\n", " Successfully uninstalled nvidia-nvjitlink-cu12-12.4.99\n", " Attempting uninstall: nvidia-nccl-cu12\n", " Found existing installation: nvidia-nccl-cu12 2.20.5\n", " Uninstalling nvidia-nccl-cu12-2.20.5:\n", " Successfully uninstalled nvidia-nccl-cu12-2.20.5\n", " Attempting uninstall: nvidia-curand-cu12\n", " Found existing installation: nvidia-curand-cu12 10.3.5.119\n", " Uninstalling nvidia-curand-cu12-10.3.5.119:\n", " Successfully uninstalled nvidia-curand-cu12-10.3.5.119\n", " Attempting uninstall: nvidia-cufft-cu12\n", " Found existing installation: nvidia-cufft-cu12 11.2.0.44\n", " Uninstalling nvidia-cufft-cu12-11.2.0.44:\n", " Successfully uninstalled nvidia-cufft-cu12-11.2.0.44\n", " Attempting uninstall: nvidia-cuda-runtime-cu12\n", " Found existing installation: nvidia-cuda-runtime-cu12 12.4.99\n", " Uninstalling nvidia-cuda-runtime-cu12-12.4.99:\n", " Successfully uninstalled nvidia-cuda-runtime-cu12-12.4.99\n", " Attempting uninstall: nvidia-cuda-nvrtc-cu12\n", " Found existing installation: nvidia-cuda-nvrtc-cu12 12.4.99\n", " Uninstalling nvidia-cuda-nvrtc-cu12-12.4.99:\n", " Successfully uninstalled nvidia-cuda-nvrtc-cu12-12.4.99\n", " Attempting uninstall: nvidia-cuda-cupti-cu12\n", " Found existing installation: nvidia-cuda-cupti-cu12 12.4.99\n", " Uninstalling nvidia-cuda-cupti-cu12-12.4.99:\n", " Successfully uninstalled nvidia-cuda-cupti-cu12-12.4.99\n", " Attempting uninstall: nvidia-cublas-cu12\n", " Found existing installation: nvidia-cublas-cu12 12.4.2.65\n", " Uninstalling nvidia-cublas-cu12-12.4.2.65:\n", " Successfully uninstalled 
nvidia-cublas-cu12-12.4.2.65\n", " Attempting uninstall: nvidia-cusparse-cu12\n", " Found existing installation: nvidia-cusparse-cu12 12.3.0.142\n", " Uninstalling nvidia-cusparse-cu12-12.3.0.142:\n", " Successfully uninstalled nvidia-cusparse-cu12-12.3.0.142\n", " Attempting uninstall: nvidia-cusolver-cu12\n", " Found existing installation: nvidia-cusolver-cu12 11.6.0.99\n", " Uninstalling nvidia-cusolver-cu12-11.6.0.99:\n", " Successfully uninstalled nvidia-cusolver-cu12-11.6.0.99\n", " Attempting uninstall: torch\n", " Found existing installation: torch 2.4.1+cu124\n", " Uninstalling torch-2.4.1+cu124:\n", " Successfully uninstalled torch-2.4.1+cu124\n", " Attempting uninstall: torchvision\n", " Found existing installation: torchvision 0.19.1+cu124\n", " Uninstalling torchvision-0.19.1+cu124:\n", " Successfully uninstalled torchvision-0.19.1+cu124\n", "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "torchaudio 2.4.1+cu124 requires torch==2.4.1, but you have torch 2.5.1 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0mSuccessfully installed annotated-types-0.7.0 click-8.1.7 cloudpickle-3.1.0 compressed-tensors-0.8.0 diskcache-5.6.3 einops-0.8.0 fastapi-0.115.6 gguf-0.10.0 httptools-0.6.4 interegular-0.3.3 jiter-0.8.0 lark-1.2.2 llvmlite-0.43.0 lm-format-enforcer-0.10.9 mistral-common-1.5.1 msgpack-1.1.0 msgspec-0.18.6 numba-0.60.0 nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-ml-py-12.560.30 nvidia-nccl-cu12-2.21.5 nvidia-nvjitlink-cu12-12.4.127 nvidia-nvtx-cu12-12.4.127 openai-1.57.0 opencv-python-headless-4.10.0.84 outlines-0.0.46 partial-json-parser-0.2.1.1.post4 pillow-10.4.0 
prometheus-fastapi-instrumentator-7.0.0 protobuf-5.29.1 py-cpuinfo-9.0.0 pyairports-2.1.1 pycountry-24.6.1 pydantic-2.10.3 pydantic-core-2.27.1 python-dotenv-1.0.1 ray-2.40.0 sentencepiece-0.2.0 starlette-0.41.3 sympy-1.13.1 tiktoken-0.7.0 torch-2.5.1 torchvision-0.20.1 triton-3.1.0 typing-extensions-4.12.2 uvicorn-0.32.1 uvloop-0.21.0 vllm-0.6.4.post1 watchfiles-1.0.0 websockets-14.1 xformers-0.0.28.post3\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", "\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n" ] } ], "source": [ "# Pinned to the versions recorded in this cell's output so a fresh run resolves the same packages.\n", "# NOTE: vllm pulls in torch 2.5.1, which pip reports as incompatible with the preinstalled torchaudio 2.4.1+cu124.\n", "%pip install vllm==0.6.4.post1 autoawq==0.2.7.post2" ] }, { "cell_type": "code", "execution_count": 1, "id": "56a955b4-c65a-4146-9281-ebcb4ee81209", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Sun Dec 8 00:57:27 2024 \n", "+-----------------------------------------------------------------------------------------+\n", "| NVIDIA-SMI 565.57.01 Driver Version: 565.57.01 CUDA Version: 12.7 |\n", "|-----------------------------------------+------------------------+----------------------+\n", "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", "| | | MIG M. 
|\n", "|=========================================+========================+======================|\n", "| 0 NVIDIA H100 NVL On | 00000000:AE:00.0 Off | 0 |\n", "| N/A 32C P0 60W / 310W | 1MiB / 95830MiB | 0% Default |\n", "| | | Disabled |\n", "+-----------------------------------------+------------------------+----------------------+\n", " \n", "+-----------------------------------------------------------------------------------------+\n", "| Processes: |\n", "| GPU GI CI PID Type Process name GPU Memory |\n", "| ID ID Usage |\n", "|=========================================================================================|\n", "| No running processes found |\n", "+-----------------------------------------------------------------------------------------+\n" ] } ], "source": [ "!nvidia-smi" ] }, { "cell_type": "code", "execution_count": 1, "id": "8f40dc17-a05a-466f-85d9-5bfa473c133b", "metadata": {}, "outputs": [], "source": [ "from vllm import LLM, SamplingParams" ] }, { "cell_type": "code", "execution_count": 2, "id": "ecf09ccb-b47b-40bc-a501-ded94239465d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "INFO 12-08 01:02:42 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.\n", "INFO 12-08 01:02:42 awq_marlin.py:113] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference\n", "WARNING 12-08 01:02:42 config.py:428] awq quantization is not fully optimized yet. The speed can be slower than non-quantized models.\n", "WARNING 12-08 01:02:42 arg_utils.py:1013] Chunked prefill is enabled by default for models with max_model_len > 32K. Currently, chunked prefill might not work with some features or models. 
If you encounter any issues, please disable chunked prefill by setting --enable-chunked-prefill=False.\n", "INFO 12-08 01:02:42 config.py:1136] Chunked prefill is enabled with max_num_batched_tokens=512.\n", "INFO 12-08 01:02:42 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='kishizaki-sci/Llama-3.3-70B-Instruct-AWQ-4bit-JP-EN', speculative_config=None, tokenizer='kishizaki-sci/Llama-3.3-70B-Instruct-AWQ-4bit-JP-EN', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=kishizaki-sci/Llama-3.3-70B-Instruct-AWQ-4bit-JP-EN, num_scheduler_steps=1, chunked_prefill_enabled=True multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)\n", "INFO 12-08 01:02:44 selector.py:135] Using Flash Attention backend.\n", "INFO 12-08 01:02:44 model_runner.py:1072] Starting to load model kishizaki-sci/Llama-3.3-70B-Instruct-AWQ-4bit-JP-EN...\n", "INFO 12-08 01:02:45 weight_utils.py:243] Using model weights format ['*.safetensors']\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1b29444155504b5eaadb35d628775f18", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Loading safetensors checkpoint shards: 0% Completed | 0/9 [00:00