sofianhw committed on
Commit
75da468
1 Parent(s): b7d4623
Files changed (4)
  1. api_server.py +112 -79
  2. protocol.py +501 -66
  3. serving_completion.py +453 -248
  4. serving_engine.py +143 -76
api_server.py CHANGED
@@ -1,30 +1,44 @@
1
- import argparse
2
  import asyncio
3
- import json
 
 
4
  from contextlib import asynccontextmanager
5
- from aioprometheus import MetricsMiddleware
6
- from aioprometheus.asgi.starlette import metrics
 
7
  import fastapi
8
  import uvicorn
9
- from http import HTTPStatus
10
  from fastapi import Request
11
  from fastapi.exceptions import RequestValidationError
12
  from fastapi.middleware.cors import CORSMiddleware
13
- from fastapi.responses import JSONResponse, StreamingResponse, Response
 
 
14
 
 
 
15
  from vllm.engine.arg_utils import AsyncEngineArgs
16
  from vllm.engine.async_llm_engine import AsyncLLMEngine
17
- from vllm.engine.metrics import add_global_metrics_labels
18
- from protocol import CompletionRequest, ChatCompletionRequest, ErrorResponse
19
  from vllm.logger import init_logger
20
- from serving_chat import OpenAIServingChat
21
- from serving_completion import OpenAIServingCompletion
22
 
23
  TIMEOUT_KEEP_ALIVE = 5 # seconds
24
 
25
- openai_serving_chat: OpenAIServingChat = None
26
- openai_serving_completion: OpenAIServingCompletion = None
27
- logger = init_logger(__name__)
28
 
29
 
30
  @asynccontextmanager
@@ -36,7 +50,9 @@ async def lifespan(app: fastapi.FastAPI):
36
  await engine.do_log_stats()
37
 
38
  if not engine_args.disable_log_stats:
39
- asyncio.create_task(_force_log())
 
 
40
 
41
  yield
42
 
@@ -45,62 +61,15 @@ app = fastapi.FastAPI(lifespan=lifespan)
45
 
46
 
47
  def parse_args():
48
- parser = argparse.ArgumentParser(
49
- description="vLLM OpenAI-Compatible RESTful API server.")
50
- parser.add_argument("--host", type=str, default=None, help="host name")
51
- parser.add_argument("--port", type=int, default=8000, help="port number")
52
- parser.add_argument("--allow-credentials",
53
- action="store_true",
54
- help="allow credentials")
55
- parser.add_argument("--allowed-origins",
56
- type=json.loads,
57
- default=["*"],
58
- help="allowed origins")
59
- parser.add_argument("--allowed-methods",
60
- type=json.loads,
61
- default=["*"],
62
- help="allowed methods")
63
- parser.add_argument("--allowed-headers",
64
- type=json.loads,
65
- default=["*"],
66
- help="allowed headers")
67
- parser.add_argument("--served-model-name",
68
- type=str,
69
- default=None,
70
- help="The model name used in the API. If not "
71
- "specified, the model name will be the same as "
72
- "the huggingface name.")
73
- parser.add_argument("--chat-template",
74
- type=str,
75
- default=None,
76
- help="The file path to the chat template, "
77
- "or the template in single-line form "
78
- "for the specified model")
79
- parser.add_argument("--response-role",
80
- type=str,
81
- default="assistant",
82
- help="The role name to return if "
83
- "`request.add_generation_prompt=true`.")
84
- parser.add_argument("--ssl-keyfile",
85
- type=str,
86
- default=None,
87
- help="The file path to the SSL key file")
88
- parser.add_argument("--ssl-certfile",
89
- type=str,
90
- default=None,
91
- help="The file path to the SSL cert file")
92
- parser.add_argument(
93
- "--root-path",
94
- type=str,
95
- default=None,
96
- help="FastAPI root_path when app is behind a path based routing proxy")
97
-
98
- parser = AsyncEngineArgs.add_cli_args(parser)
99
  return parser.parse_args()
100
 
101
 
102
- app.add_middleware(MetricsMiddleware) # Trace HTTP server metrics
103
- app.add_route("/metrics", metrics) # Exposes HTTP metrics
 
 
 
104
 
105
 
106
  @app.exception_handler(RequestValidationError)
@@ -112,6 +81,7 @@ async def validation_exception_handler(_, exc):
112
  @app.get("/health")
113
  async def health() -> Response:
114
  """Health check."""
 
115
  return Response(status_code=200)
116
 
117
 
@@ -121,6 +91,12 @@ async def show_available_models():
121
  return JSONResponse(content=models.model_dump())
122
 
123
 
 
 
 
 
 
 
124
  @app.post("/api/v1/chat/completions")
125
  async def create_chat_completion(request: ChatCompletionRequest,
126
  raw_request: Request):
@@ -133,6 +109,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
133
  return StreamingResponse(content=generator,
134
  media_type="text/event-stream")
135
  else:
 
136
  return JSONResponse(content=generator.model_dump())
137
 
138
 
@@ -150,6 +127,17 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
150
  return JSONResponse(content=generator.model_dump())
151
 
152
153
  if __name__ == "__main__":
154
  args = parse_args()
155
 
@@ -161,28 +149,73 @@ if __name__ == "__main__":
161
  allow_headers=args.allowed_headers,
162
  )
163
 
164
- logger.info(f"args: {args}")
165
 
166
  if args.served_model_name is not None:
167
- served_model = args.served_model_name
168
  else:
169
- served_model = args.model
170
 
171
  engine_args = AsyncEngineArgs.from_cli_args(args)
172
- engine = AsyncLLMEngine.from_engine_args(engine_args)
173
- openai_serving_chat = OpenAIServingChat(engine, served_model,
174
  args.response_role,
 
175
  args.chat_template)
176
- openai_serving_completion = OpenAIServingCompletion(engine, served_model)
177
-
178
- # Register labels for metrics
179
- add_global_metrics_labels(model_name=engine_args.model)
180
-
181
  app.root_path = args.root_path
182
  uvicorn.run(app,
183
  host=args.host,
184
  port=args.port,
185
- log_level="info",
186
  timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
187
  ssl_keyfile=args.ssl_keyfile,
188
- ssl_certfile=args.ssl_certfile)
 
 
 
 
1
  import asyncio
2
+ import importlib
3
+ import inspect
4
+ import re
5
  from contextlib import asynccontextmanager
6
+ from http import HTTPStatus
7
+ from typing import Optional, Set
8
+
9
  import fastapi
10
  import uvicorn
 
11
  from fastapi import Request
12
  from fastapi.exceptions import RequestValidationError
13
  from fastapi.middleware.cors import CORSMiddleware
14
+ from fastapi.responses import JSONResponse, Response, StreamingResponse
15
+ from prometheus_client import make_asgi_app
16
+ from starlette.routing import Mount
17
 
18
+ import vllm
19
+ import vllm.envs as envs
20
  from vllm.engine.arg_utils import AsyncEngineArgs
21
  from vllm.engine.async_llm_engine import AsyncLLMEngine
22
+ from vllm.entrypoints.openai.cli_args import make_arg_parser
23
+ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
24
+ ChatCompletionResponse,
25
+ CompletionRequest,
26
+ EmbeddingRequest, ErrorResponse)
27
+ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
28
+ from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
29
+ from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
30
  from vllm.logger import init_logger
31
+ from vllm.usage.usage_lib import UsageContext
 
32
 
33
  TIMEOUT_KEEP_ALIVE = 5 # seconds
34
 
35
+ openai_serving_chat: OpenAIServingChat
36
+ openai_serving_completion: OpenAIServingCompletion
37
+ openai_serving_embedding: OpenAIServingEmbedding
38
+
39
+ logger = init_logger('vllm.entrypoints.openai.api_server')
40
+
41
+ _running_tasks: Set[asyncio.Task] = set()
42
 
43
 
44
  @asynccontextmanager
 
50
  await engine.do_log_stats()
51
 
52
  if not engine_args.disable_log_stats:
53
+ task = asyncio.create_task(_force_log())
54
+ _running_tasks.add(task)
55
+ task.add_done_callback(_running_tasks.remove)
56
 
57
  yield
58
 
 
61
 
62
 
63
  def parse_args():
64
+ parser = make_arg_parser()
65
  return parser.parse_args()
66
 
67
 
68
+ # Add prometheus asgi middleware to route /metrics requests
69
+ route = Mount("/metrics", make_asgi_app())
70
+ # Workaround for 307 Redirect for /metrics
71
+ route.path_regex = re.compile('^/metrics(?P<path>.*)$')
72
+ app.routes.append(route)
73
 
74
 
75
  @app.exception_handler(RequestValidationError)
 
81
  @app.get("/health")
82
  async def health() -> Response:
83
  """Health check."""
84
+ await openai_serving_chat.engine.check_health()
85
  return Response(status_code=200)
86
 
87
 
 
91
  return JSONResponse(content=models.model_dump())
92
 
93
 
94
+ @app.get("/version")
95
+ async def show_version():
96
+ ver = {"version": vllm.__version__}
97
+ return JSONResponse(content=ver)
98
+
99
+
100
  @app.post("/api/v1/chat/completions")
101
  async def create_chat_completion(request: ChatCompletionRequest,
102
  raw_request: Request):
 
109
  return StreamingResponse(content=generator,
110
  media_type="text/event-stream")
111
  else:
112
+ assert isinstance(generator, ChatCompletionResponse)
113
  return JSONResponse(content=generator.model_dump())
114
 
115
 
 
127
  return JSONResponse(content=generator.model_dump())
128
 
129
 
130
+ @app.post("/api/v1/embeddings")
131
+ async def create_embedding(request: EmbeddingRequest, raw_request: Request):
132
+ generator = await openai_serving_embedding.create_embedding(
133
+ request, raw_request)
134
+ if isinstance(generator, ErrorResponse):
135
+ return JSONResponse(content=generator.model_dump(),
136
+ status_code=generator.code)
137
+ else:
138
+ return JSONResponse(content=generator.model_dump())
139
+
140
+
141
  if __name__ == "__main__":
142
  args = parse_args()
143
 
 
149
  allow_headers=args.allowed_headers,
150
  )
151
 
152
+ if token := envs.VLLM_API_KEY or args.api_key:
153
+
154
+ @app.middleware("http")
155
+ async def authentication(request: Request, call_next):
156
+ root_path = "" if args.root_path is None else args.root_path
157
+ if request.method == "OPTIONS":
158
+ return await call_next(request)
159
+ if not request.url.path.startswith(f"{root_path}/v1"):
160
+ return await call_next(request)
161
+ if request.headers.get("Authorization") != "Bearer " + token:
162
+ return JSONResponse(content={"error": "Unauthorized"},
163
+ status_code=401)
164
+ return await call_next(request)
165
+
166
+ for middleware in args.middleware:
167
+ module_path, object_name = middleware.rsplit(".", 1)
168
+ imported = getattr(importlib.import_module(module_path), object_name)
169
+ if inspect.isclass(imported):
170
+ app.add_middleware(imported)
171
+ elif inspect.iscoroutinefunction(imported):
172
+ app.middleware("http")(imported)
173
+ else:
174
+ raise ValueError(f"Invalid middleware {middleware}. "
175
+ f"Must be a function or a class.")
176
+
177
+ logger.info("vLLM API server version %s", vllm.__version__)
178
+ logger.info("args: %s", args)
179
 
180
  if args.served_model_name is not None:
181
+ served_model_names = args.served_model_name
182
  else:
183
+ served_model_names = [args.model]
184
 
185
  engine_args = AsyncEngineArgs.from_cli_args(args)
186
+ engine = AsyncLLMEngine.from_engine_args(
187
+ engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
188
+
189
+ event_loop: Optional[asyncio.AbstractEventLoop]
190
+ try:
191
+ event_loop = asyncio.get_running_loop()
192
+ except RuntimeError:
193
+ event_loop = None
194
+
195
+ if event_loop is not None and event_loop.is_running():
196
+ # If the current is instanced by Ray Serve,
197
+ # there is already a running event loop
198
+ model_config = event_loop.run_until_complete(engine.get_model_config())
199
+ else:
200
+ # When using single vLLM without engine_use_ray
201
+ model_config = asyncio.run(engine.get_model_config())
202
+
203
+ openai_serving_chat = OpenAIServingChat(engine, model_config,
204
+ served_model_names,
205
  args.response_role,
206
+ args.lora_modules,
207
  args.chat_template)
208
+ openai_serving_completion = OpenAIServingCompletion(
209
+ engine, model_config, served_model_names, args.lora_modules)
210
+ openai_serving_embedding = OpenAIServingEmbedding(engine, model_config,
211
+ served_model_names)
 
212
  app.root_path = args.root_path
213
  uvicorn.run(app,
214
  host=args.host,
215
  port=args.port,
216
+ log_level=args.uvicorn_log_level,
217
  timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
218
  ssl_keyfile=args.ssl_keyfile,
219
+ ssl_certfile=args.ssl_certfile,
220
+ ssl_ca_certs=args.ssl_ca_certs,
221
+ ssl_cert_reqs=args.ssl_cert_reqs)
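
For reviewers, a quick way to see what the rewritten api_server.py exposes: the sketch below is an editor's illustration (not part of the commit) that exercises the /health, /version, and /api/v1/chat/completions routes defined above. It assumes a locally running server on the default port 8000; the base URL, model name, and bearer token are placeholder values, and the Authorization header only matters if an API key was configured via --api-key or VLLM_API_KEY.

```python
# Editor's sketch: exercising the routes defined in the new api_server.py.
# Assumes a local server on the default port; names below are placeholders.
import requests

BASE = "http://localhost:8000"
HEADERS = {"Authorization": "Bearer my-secret-key"}  # only needed if an API key is set

print(requests.get(f"{BASE}/health").status_code)  # 200 when the engine is healthy
print(requests.get(f"{BASE}/version").json())      # {"version": "<vllm version>"}

resp = requests.post(
    f"{BASE}/api/v1/chat/completions",
    headers=HEADERS,
    json={
        "model": "my-model",  # placeholder served model name
        "messages": [{"role": "user", "content": "Hello!"}],
    },
)
print(resp.json())
```
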
protocol.py CHANGED
@@ -1,15 +1,58 @@
1
  # Adapted from
2
  # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
3
  import time
4
- from typing import Dict, List, Literal, Optional, Union
5
 
6
- from pydantic import BaseModel, Field
 
 
 
 
7
 
8
- from vllm.utils import random_uuid
9
  from vllm.sampling_params import SamplingParams
10
 
11
 
12
- class ErrorResponse(BaseModel):
13
  object: str = "error"
14
  message: str
15
  type: str
@@ -17,7 +60,7 @@ class ErrorResponse(BaseModel):
17
  code: int
18
 
19
 
20
- class ModelPermission(BaseModel):
21
  id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
22
  object: str = "model_permission"
23
  created: int = Field(default_factory=lambda: int(time.time()))
@@ -29,57 +72,176 @@ class ModelPermission(BaseModel):
29
  allow_fine_tuning: bool = False
30
  organization: str = "*"
31
  group: Optional[str] = None
32
- is_blocking: str = False
33
 
34
 
35
- class ModelCard(BaseModel):
36
  id: str
37
  object: str = "model"
38
  created: int = Field(default_factory=lambda: int(time.time()))
39
  owned_by: str = "vllm"
40
  root: Optional[str] = None
41
  parent: Optional[str] = None
 
42
  permission: List[ModelPermission] = Field(default_factory=list)
43
 
44
 
45
- class ModelList(BaseModel):
46
  object: str = "list"
47
  data: List[ModelCard] = Field(default_factory=list)
48
 
49
 
50
- class UsageInfo(BaseModel):
51
  prompt_tokens: int = 0
52
  total_tokens: int = 0
53
  completion_tokens: Optional[int] = 0
54
 
55
 
56
- class ChatCompletionRequest(BaseModel):
57
  model: str
58
- messages: Union[str, List[Dict[str, str]]]
59
- temperature: Optional[float] = 0.7
60
- top_p: Optional[float] = 1.0
61
- n: Optional[int] = 1
62
  max_tokens: Optional[int] = None
63
  stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
64
  stream: Optional[bool] = False
65
- presence_penalty: Optional[float] = 0.0
66
- frequency_penalty: Optional[float] = 0.0
67
- logit_bias: Optional[Dict[str, float]] = None
 
 
68
  user: Optional[str] = None
69
- # Additional parameters supported by vLLM
 
70
  best_of: Optional[int] = None
 
71
  top_k: Optional[int] = -1
 
 
 
 
72
  ignore_eos: Optional[bool] = False
73
- use_beam_search: Optional[bool] = False
74
  stop_token_ids: Optional[List[int]] = Field(default_factory=list)
75
  skip_special_tokens: Optional[bool] = True
76
  spaces_between_special_tokens: Optional[bool] = True
77
- add_generation_prompt: Optional[bool] = True
78
- echo: Optional[bool] = False
79
- repetition_penalty: Optional[float] = 1.0
80
- min_p: Optional[float] = 0.0
81
 
82
  def to_sampling_params(self) -> SamplingParams:
83
  return SamplingParams(
84
  n=self.n,
85
  presence_penalty=self.presence_penalty,
@@ -88,49 +250,173 @@ class ChatCompletionRequest(BaseModel):
88
  temperature=self.temperature,
89
  top_p=self.top_p,
90
  min_p=self.min_p,
 
91
  stop=self.stop,
92
  stop_token_ids=self.stop_token_ids,
93
  max_tokens=self.max_tokens,
 
 
 
94
  best_of=self.best_of,
95
  top_k=self.top_k,
96
  ignore_eos=self.ignore_eos,
97
  use_beam_search=self.use_beam_search,
 
98
  skip_special_tokens=self.skip_special_tokens,
99
  spaces_between_special_tokens=self.spaces_between_special_tokens,
 
 
 
100
  )
101
 
102
-
103
- class CompletionRequest(BaseModel):
104
  model: str
105
- # a string, array of strings, array of tokens, or array of token arrays
106
  prompt: Union[List[int], List[List[int]], str, List[str]]
107
- suffix: Optional[str] = None
108
- max_tokens: Optional[int] = 16
109
- temperature: Optional[float] = 1.0
110
- top_p: Optional[float] = 1.0
111
- n: Optional[int] = 1
112
- stream: Optional[bool] = False
113
- logprobs: Optional[int] = None
114
  echo: Optional[bool] = False
115
- stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
116
- presence_penalty: Optional[float] = 0.0
117
  frequency_penalty: Optional[float] = 0.0
118
- best_of: Optional[int] = None
119
  logit_bias: Optional[Dict[str, float]] = None
120
  user: Optional[str] = None
121
- # Additional parameters supported by vLLM
122
- top_k: Optional[int] = -1
123
- ignore_eos: Optional[bool] = False
124
  use_beam_search: Optional[bool] = False
125
  stop_token_ids: Optional[List[int]] = Field(default_factory=list)
 
 
126
  skip_special_tokens: Optional[bool] = True
127
  spaces_between_special_tokens: Optional[bool] = True
128
- repetition_penalty: Optional[float] = 1.0
129
- min_p: Optional[float] = 0.0
130
 
131
  def to_sampling_params(self):
132
  echo_without_generation = self.echo and self.max_tokens == 0
133
134
  return SamplingParams(
135
  n=self.n,
136
  best_of=self.best_of,
@@ -141,33 +427,88 @@ class CompletionRequest(BaseModel):
141
  top_p=self.top_p,
142
  top_k=self.top_k,
143
  min_p=self.min_p,
 
144
  stop=self.stop,
145
  stop_token_ids=self.stop_token_ids,
146
  ignore_eos=self.ignore_eos,
147
  max_tokens=self.max_tokens if not echo_without_generation else 1,
 
148
  logprobs=self.logprobs,
149
  use_beam_search=self.use_beam_search,
 
150
  prompt_logprobs=self.logprobs if self.echo else None,
151
  skip_special_tokens=self.skip_special_tokens,
152
  spaces_between_special_tokens=(self.spaces_between_special_tokens),
 
 
 
 
153
  )
154
155
 
156
- class LogProbs(BaseModel):
157
  text_offset: List[int] = Field(default_factory=list)
158
  token_logprobs: List[Optional[float]] = Field(default_factory=list)
159
  tokens: List[str] = Field(default_factory=list)
160
- top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None
161
 
162
 
163
- class CompletionResponseChoice(BaseModel):
164
  index: int
165
  text: str
166
- logprobs: Optional[LogProbs] = None
167
- finish_reason: Optional[Literal["stop", "length"]] = None
168
-
169
-
170
- class CompletionResponse(BaseModel):
171
  id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
172
  object: str = "text_completion"
173
  created: int = Field(default_factory=lambda: int(time.time()))
@@ -176,14 +517,21 @@ class CompletionResponse(BaseModel):
176
  usage: UsageInfo
177
 
178
 
179
- class CompletionResponseStreamChoice(BaseModel):
180
  index: int
181
  text: str
182
- logprobs: Optional[LogProbs] = None
183
- finish_reason: Optional[Literal["stop", "length"]] = None
184
-
185
-
186
- class CompletionStreamResponse(BaseModel):
187
  id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
188
  object: str = "text_completion"
189
  created: int = Field(default_factory=lambda: int(time.time()))
@@ -192,41 +540,128 @@ class CompletionStreamResponse(BaseModel):
192
  usage: Optional[UsageInfo] = Field(default=None)
193
 
194
 
195
- class ChatMessage(BaseModel):
196
  role: str
197
  content: str
198
 
199
 
200
- class ChatCompletionResponseChoice(BaseModel):
201
  index: int
202
  message: ChatMessage
203
- finish_reason: Optional[Literal["stop", "length"]] = None
 
 
204
 
205
 
206
- class ChatCompletionResponse(BaseModel):
207
  id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
208
- object: str = "chat.completion"
209
  created: int = Field(default_factory=lambda: int(time.time()))
210
  model: str
211
  choices: List[ChatCompletionResponseChoice]
212
  usage: UsageInfo
213
 
214
 
215
- class DeltaMessage(BaseModel):
216
  role: Optional[str] = None
217
  content: Optional[str] = None
 
218
 
219
 
220
- class ChatCompletionResponseStreamChoice(BaseModel):
221
  index: int
222
  delta: DeltaMessage
223
- finish_reason: Optional[Literal["stop", "length"]] = None
 
 
224
 
225
 
226
- class ChatCompletionStreamResponse(BaseModel):
227
  id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
228
- object: str = "chat.completion.chunk"
229
  created: int = Field(default_factory=lambda: int(time.time()))
230
  model: str
231
  choices: List[ChatCompletionResponseStreamChoice]
232
- usage: Optional[UsageInfo] = Field(default=None)
1
  # Adapted from
2
  # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
3
  import time
4
+ from typing import Any, Dict, List, Literal, Optional, Union
5
 
6
+ import openai.types.chat
7
+ import torch
8
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
9
+ # pydantic needs the TypedDict from typing_extensions
10
+ from typing_extensions import Annotated, Required, TypedDict
11
 
12
+ from vllm.pooling_params import PoolingParams
13
  from vllm.sampling_params import SamplingParams
14
+ from vllm.utils import random_uuid
15
+
16
+
17
+ class CustomChatCompletionContentPartParam(TypedDict, total=False):
18
+ __pydantic_config__ = ConfigDict(extra="allow") # type: ignore
19
+
20
+ type: Required[str]
21
+ """The type of the content part."""
22
+
23
+
24
+ ChatCompletionContentPartParam = Union[
25
+ openai.types.chat.ChatCompletionContentPartParam,
26
+ CustomChatCompletionContentPartParam]
27
+
28
+
29
+ class CustomChatCompletionMessageParam(TypedDict, total=False):
30
+ """Enables custom roles in the Chat Completion API."""
31
+ role: Required[str]
32
+ """The role of the message's author."""
33
+
34
+ content: Union[str, List[ChatCompletionContentPartParam]]
35
+ """The contents of the message."""
36
+
37
+ name: str
38
+ """An optional name for the participant.
39
+
40
+ Provides the model information to differentiate between participants of the
41
+ same role.
42
+ """
43
+
44
+
45
+ ChatCompletionMessageParam = Union[
46
+ openai.types.chat.ChatCompletionMessageParam,
47
+ CustomChatCompletionMessageParam]
48
+
49
+
50
+ class OpenAIBaseModel(BaseModel):
51
+ # OpenAI API does not allow extra fields
52
+ model_config = ConfigDict(extra="forbid")
53
 
54
 
55
+ class ErrorResponse(OpenAIBaseModel):
56
  object: str = "error"
57
  message: str
58
  type: str
 
60
  code: int
61
 
62
 
63
+ class ModelPermission(OpenAIBaseModel):
64
  id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}")
65
  object: str = "model_permission"
66
  created: int = Field(default_factory=lambda: int(time.time()))
 
72
  allow_fine_tuning: bool = False
73
  organization: str = "*"
74
  group: Optional[str] = None
75
+ is_blocking: bool = False
76
 
77
 
78
+ class ModelCard(OpenAIBaseModel):
79
  id: str
80
  object: str = "model"
81
  created: int = Field(default_factory=lambda: int(time.time()))
82
  owned_by: str = "vllm"
83
  root: Optional[str] = None
84
  parent: Optional[str] = None
85
+ max_model_len: Optional[int] = None
86
  permission: List[ModelPermission] = Field(default_factory=list)
87
 
88
 
89
+ class ModelList(OpenAIBaseModel):
90
  object: str = "list"
91
  data: List[ModelCard] = Field(default_factory=list)
92
 
93
 
94
+ class UsageInfo(OpenAIBaseModel):
95
  prompt_tokens: int = 0
96
  total_tokens: int = 0
97
  completion_tokens: Optional[int] = 0
98
 
99
 
100
+ class ResponseFormat(OpenAIBaseModel):
101
+ # type must be "json_object" or "text"
102
+ type: Literal["text", "json_object"]
103
+
104
+
105
+ class FunctionDefinition(OpenAIBaseModel):
106
+ name: str
107
+ description: Optional[str] = None
108
+ parameters: Optional[Dict[str, Any]] = None
109
+
110
+
111
+ class ChatCompletionToolsParam(OpenAIBaseModel):
112
+ type: Literal["function"] = "function"
113
+ function: FunctionDefinition
114
+
115
+
116
+ class ChatCompletionNamedFunction(OpenAIBaseModel):
117
+ name: str
118
+
119
+
120
+ class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
121
+ function: ChatCompletionNamedFunction
122
+ type: Literal["function"] = "function"
123
+
124
+
125
+ class ChatCompletionRequest(OpenAIBaseModel):
126
+ # Ordered by official OpenAI API documentation
127
+ # https://platform.openai.com/docs/api-reference/chat/create
128
+ messages: List[ChatCompletionMessageParam]
129
  model: str
130
+ frequency_penalty: Optional[float] = 0.0
131
+ logit_bias: Optional[Dict[str, float]] = None
132
+ logprobs: Optional[bool] = False
133
+ top_logprobs: Optional[int] = 0
134
  max_tokens: Optional[int] = None
135
+ n: Optional[int] = 1
136
+ presence_penalty: Optional[float] = 0.0
137
+ response_format: Optional[ResponseFormat] = None
138
+ seed: Optional[int] = Field(None,
139
+ ge=torch.iinfo(torch.long).min,
140
+ le=torch.iinfo(torch.long).max)
141
  stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
142
  stream: Optional[bool] = False
143
+ temperature: Optional[float] = 0.7
144
+ top_p: Optional[float] = 1.0
145
+ tools: Optional[List[ChatCompletionToolsParam]] = None
146
+ tool_choice: Optional[Union[Literal["none"],
147
+ ChatCompletionNamedToolChoiceParam]] = "none"
148
  user: Optional[str] = None
149
+
150
+ # doc: begin-chat-completion-sampling-params
151
  best_of: Optional[int] = None
152
+ use_beam_search: Optional[bool] = False
153
  top_k: Optional[int] = -1
154
+ min_p: Optional[float] = 0.0
155
+ repetition_penalty: Optional[float] = 1.0
156
+ length_penalty: Optional[float] = 1.0
157
+ early_stopping: Optional[bool] = False
158
  ignore_eos: Optional[bool] = False
159
+ min_tokens: Optional[int] = 0
160
  stop_token_ids: Optional[List[int]] = Field(default_factory=list)
161
  skip_special_tokens: Optional[bool] = True
162
  spaces_between_special_tokens: Optional[bool] = True
163
+ # doc: end-chat-completion-sampling-params
164
+
165
+ # doc: begin-chat-completion-extra-params
166
+ echo: Optional[bool] = Field(
167
+ default=False,
168
+ description=(
169
+ "If true, the new message will be prepended with the last message "
170
+ "if they belong to the same role."),
171
+ )
172
+ add_generation_prompt: Optional[bool] = Field(
173
+ default=True,
174
+ description=
175
+ ("If true, the generation prompt will be added to the chat template. "
176
+ "This is a parameter used by chat template in tokenizer config of the "
177
+ "model."),
178
+ )
179
+ add_special_tokens: Optional[bool] = Field(
180
+ default=False,
181
+ description=(
182
+ "If true, special tokens (e.g. BOS) will be added to the prompt "
183
+ "on top of what is added by the chat template. "
184
+ "For most models, the chat template takes care of adding the "
185
+ "special tokens so this should be set to False (as is the "
186
+ "default)."),
187
+ )
188
+ include_stop_str_in_output: Optional[bool] = Field(
189
+ default=False,
190
+ description=(
191
+ "Whether to include the stop string in the output. "
192
+ "This is only applied when the stop or stop_token_ids is set."),
193
+ )
194
+ guided_json: Optional[Union[str, dict, BaseModel]] = Field(
195
+ default=None,
196
+ description=("If specified, the output will follow the JSON schema."),
197
+ )
198
+ guided_regex: Optional[str] = Field(
199
+ default=None,
200
+ description=(
201
+ "If specified, the output will follow the regex pattern."),
202
+ )
203
+ guided_choice: Optional[List[str]] = Field(
204
+ default=None,
205
+ description=(
206
+ "If specified, the output will be exactly one of the choices."),
207
+ )
208
+ guided_grammar: Optional[str] = Field(
209
+ default=None,
210
+ description=(
211
+ "If specified, the output will follow the context free grammar."),
212
+ )
213
+ guided_decoding_backend: Optional[str] = Field(
214
+ default=None,
215
+ description=(
216
+ "If specified, will override the default guided decoding backend "
217
+ "of the server for this specific request. If set, must be either "
218
+ "'outlines' / 'lm-format-enforcer'"))
219
+ guided_whitespace_pattern: Optional[str] = Field(
220
+ default=None,
221
+ description=(
222
+ "If specified, will override the default whitespace pattern "
223
+ "for guided json decoding."))
224
+
225
+ # doc: end-chat-completion-extra-params
226
 
227
  def to_sampling_params(self) -> SamplingParams:
228
+ # We now allow logprobs being true without top_logprobs.
229
+
230
+ logits_processors = None
231
+ if self.logit_bias:
232
+
233
+ def logit_bias_logits_processor(
234
+ token_ids: List[int],
235
+ logits: torch.Tensor) -> torch.Tensor:
236
+ assert self.logit_bias is not None
237
+ for token_id, bias in self.logit_bias.items():
238
+ # Clamp the bias between -100 and 100 per OpenAI API spec
239
+ bias = min(100, max(-100, bias))
240
+ logits[int(token_id)] += bias
241
+ return logits
242
+
243
+ logits_processors = [logit_bias_logits_processor]
244
+
245
  return SamplingParams(
246
  n=self.n,
247
  presence_penalty=self.presence_penalty,
 
250
  temperature=self.temperature,
251
  top_p=self.top_p,
252
  min_p=self.min_p,
253
+ seed=self.seed,
254
  stop=self.stop,
255
  stop_token_ids=self.stop_token_ids,
256
  max_tokens=self.max_tokens,
257
+ min_tokens=self.min_tokens,
258
+ logprobs=self.top_logprobs if self.logprobs else None,
259
+ prompt_logprobs=self.top_logprobs if self.echo else None,
260
  best_of=self.best_of,
261
  top_k=self.top_k,
262
  ignore_eos=self.ignore_eos,
263
  use_beam_search=self.use_beam_search,
264
+ early_stopping=self.early_stopping,
265
  skip_special_tokens=self.skip_special_tokens,
266
  spaces_between_special_tokens=self.spaces_between_special_tokens,
267
+ include_stop_str_in_output=self.include_stop_str_in_output,
268
+ length_penalty=self.length_penalty,
269
+ logits_processors=logits_processors,
270
  )
271
 
272
+ @model_validator(mode="before")
273
+ @classmethod
274
+ def check_guided_decoding_count(cls, data):
275
+ guide_count = sum([
276
+ "guided_json" in data and data["guided_json"] is not None,
277
+ "guided_regex" in data and data["guided_regex"] is not None,
278
+ "guided_choice" in data and data["guided_choice"] is not None
279
+ ])
280
+ # you can only use one kind of guided decoding
281
+ if guide_count > 1:
282
+ raise ValueError(
283
+ "You can only use one kind of guided decoding "
284
+ "('guided_json', 'guided_regex' or 'guided_choice').")
285
+ # you can only either use guided decoding or tools, not both
286
+ if guide_count > 1 and "tool_choice" in data and data[
287
+ "tool_choice"] != "none":
288
+ raise ValueError(
289
+ "You can only either use guided decoding or tools, not both.")
290
+ return data
291
+
292
+ @model_validator(mode="before")
293
+ @classmethod
294
+ def check_tool_choice(cls, data):
295
+ if "tool_choice" in data and data["tool_choice"] != "none":
296
+ if not isinstance(data["tool_choice"], dict):
297
+ raise ValueError("Currently only named tools are supported.")
298
+ if "tools" not in data or data["tools"] is None:
299
+ raise ValueError(
300
+ "When using `tool_choice`, `tools` must be set.")
301
+ return data
302
+
303
+ @model_validator(mode="before")
304
+ @classmethod
305
+ def check_logprobs(cls, data):
306
+ if "top_logprobs" in data and data["top_logprobs"] is not None:
307
+ if "logprobs" not in data or data["logprobs"] is False:
308
+ raise ValueError(
309
+ "when using `top_logprobs`, `logprobs` must be set to true."
310
+ )
311
+ elif not 0 <= data["top_logprobs"] <= 20:
312
+ raise ValueError(
313
+ "`top_logprobs` must be a value in the interval [0, 20].")
314
+ return data
315
+
316
+
317
+ class CompletionRequest(OpenAIBaseModel):
318
+ # Ordered by official OpenAI API documentation
319
+ # https://platform.openai.com/docs/api-reference/completions/create
320
  model: str
 
321
  prompt: Union[List[int], List[List[int]], str, List[str]]
322
+ best_of: Optional[int] = None
323
  echo: Optional[bool] = False
 
 
324
  frequency_penalty: Optional[float] = 0.0
 
325
  logit_bias: Optional[Dict[str, float]] = None
326
+ logprobs: Optional[int] = None
327
+ max_tokens: Optional[int] = 16
328
+ n: int = 1
329
+ presence_penalty: Optional[float] = 0.0
330
+ seed: Optional[int] = Field(None,
331
+ ge=torch.iinfo(torch.long).min,
332
+ le=torch.iinfo(torch.long).max)
333
+ stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
334
+ stream: Optional[bool] = False
335
+ suffix: Optional[str] = None
336
+ temperature: Optional[float] = 1.0
337
+ top_p: Optional[float] = 1.0
338
  user: Optional[str] = None
339
+
340
+ # doc: begin-completion-sampling-params
 
341
  use_beam_search: Optional[bool] = False
342
+ top_k: Optional[int] = -1
343
+ min_p: Optional[float] = 0.0
344
+ repetition_penalty: Optional[float] = 1.0
345
+ length_penalty: Optional[float] = 1.0
346
+ early_stopping: Optional[bool] = False
347
  stop_token_ids: Optional[List[int]] = Field(default_factory=list)
348
+ ignore_eos: Optional[bool] = False
349
+ min_tokens: Optional[int] = 0
350
  skip_special_tokens: Optional[bool] = True
351
  spaces_between_special_tokens: Optional[bool] = True
352
+ truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
353
+ # doc: end-completion-sampling-params
354
+
355
+ # doc: begin-completion-extra-params
356
+ include_stop_str_in_output: Optional[bool] = Field(
357
+ default=False,
358
+ description=(
359
+ "Whether to include the stop string in the output. "
360
+ "This is only applied when the stop or stop_token_ids is set."),
361
+ )
362
+ response_format: Optional[ResponseFormat] = Field(
363
+ default=None,
364
+ description=
365
+ ("Similar to chat completion, this parameter specifies the format of "
366
+ "output. Only {'type': 'json_object'} or {'type': 'text' } is "
367
+ "supported."),
368
+ )
369
+ guided_json: Optional[Union[str, dict, BaseModel]] = Field(
370
+ default=None,
371
+ description=("If specified, the output will follow the JSON schema."),
372
+ )
373
+ guided_regex: Optional[str] = Field(
374
+ default=None,
375
+ description=(
376
+ "If specified, the output will follow the regex pattern."),
377
+ )
378
+ guided_choice: Optional[List[str]] = Field(
379
+ default=None,
380
+ description=(
381
+ "If specified, the output will be exactly one of the choices."),
382
+ )
383
+ guided_grammar: Optional[str] = Field(
384
+ default=None,
385
+ description=(
386
+ "If specified, the output will follow the context free grammar."),
387
+ )
388
+ guided_decoding_backend: Optional[str] = Field(
389
+ default=None,
390
+ description=(
391
+ "If specified, will override the default guided decoding backend "
392
+ "of the server for this specific request. If set, must be one of "
393
+ "'outlines' / 'lm-format-enforcer'"))
394
+ guided_whitespace_pattern: Optional[str] = Field(
395
+ default=None,
396
+ description=(
397
+ "If specified, will override the default whitespace pattern "
398
+ "for guided json decoding."))
399
+
400
+ # doc: end-completion-extra-params
401
 
402
  def to_sampling_params(self):
403
  echo_without_generation = self.echo and self.max_tokens == 0
404
 
405
+ logits_processors = None
406
+ if self.logit_bias:
407
+
408
+ def logit_bias_logits_processor(
409
+ token_ids: List[int],
410
+ logits: torch.Tensor) -> torch.Tensor:
411
+ assert self.logit_bias is not None
412
+ for token_id, bias in self.logit_bias.items():
413
+ # Clamp the bias between -100 and 100 per OpenAI API spec
414
+ bias = min(100, max(-100, bias))
415
+ logits[int(token_id)] += bias
416
+ return logits
417
+
418
+ logits_processors = [logit_bias_logits_processor]
419
+
420
  return SamplingParams(
421
  n=self.n,
422
  best_of=self.best_of,
 
427
  top_p=self.top_p,
428
  top_k=self.top_k,
429
  min_p=self.min_p,
430
+ seed=self.seed,
431
  stop=self.stop,
432
  stop_token_ids=self.stop_token_ids,
433
  ignore_eos=self.ignore_eos,
434
  max_tokens=self.max_tokens if not echo_without_generation else 1,
435
+ min_tokens=self.min_tokens,
436
  logprobs=self.logprobs,
437
  use_beam_search=self.use_beam_search,
438
+ early_stopping=self.early_stopping,
439
  prompt_logprobs=self.logprobs if self.echo else None,
440
  skip_special_tokens=self.skip_special_tokens,
441
  spaces_between_special_tokens=(self.spaces_between_special_tokens),
442
+ include_stop_str_in_output=self.include_stop_str_in_output,
443
+ length_penalty=self.length_penalty,
444
+ logits_processors=logits_processors,
445
+ truncate_prompt_tokens=self.truncate_prompt_tokens,
446
  )
447
 
448
+ @model_validator(mode="before")
449
+ @classmethod
450
+ def check_guided_decoding_count(cls, data):
451
+ guide_count = sum([
452
+ "guided_json" in data and data["guided_json"] is not None,
453
+ "guided_regex" in data and data["guided_regex"] is not None,
454
+ "guided_choice" in data and data["guided_choice"] is not None
455
+ ])
456
+ if guide_count > 1:
457
+ raise ValueError(
458
+ "You can only use one kind of guided decoding "
459
+ "('guided_json', 'guided_regex' or 'guided_choice').")
460
+ return data
461
+
462
+ @model_validator(mode="before")
463
+ @classmethod
464
+ def check_logprobs(cls, data):
465
+ if "logprobs" in data and data[
466
+ "logprobs"] is not None and not 0 <= data["logprobs"] <= 5:
467
+ raise ValueError(("if passed, `logprobs` must be a value",
468
+ " in the interval [0, 5]."))
469
+ return data
470
+
471
+
472
+ class EmbeddingRequest(BaseModel):
473
+ # Ordered by official OpenAI API documentation
474
+ # https://platform.openai.com/docs/api-reference/embeddings
475
+ model: str
476
+ input: Union[List[int], List[List[int]], str, List[str]]
477
+ encoding_format: Optional[str] = Field('float', pattern='^(float|base64)$')
478
+ dimensions: Optional[int] = None
479
+ user: Optional[str] = None
480
+
481
+ # doc: begin-embedding-pooling-params
482
+ additional_data: Optional[Any] = None
483
+
484
+ # doc: end-embedding-pooling-params
485
+
486
+ def to_pooling_params(self):
487
+ return PoolingParams(additional_data=self.additional_data)
488
+
489
 
490
+ class CompletionLogProbs(OpenAIBaseModel):
491
  text_offset: List[int] = Field(default_factory=list)
492
  token_logprobs: List[Optional[float]] = Field(default_factory=list)
493
  tokens: List[str] = Field(default_factory=list)
494
+ top_logprobs: Optional[List[Optional[Dict[str, float]]]] = None
495
 
496
 
497
+ class CompletionResponseChoice(OpenAIBaseModel):
498
  index: int
499
  text: str
500
+ logprobs: Optional[CompletionLogProbs] = None
501
+ finish_reason: Optional[str] = None
502
+ stop_reason: Optional[Union[int, str]] = Field(
503
+ default=None,
504
+ description=(
505
+ "The stop string or token id that caused the completion "
506
+ "to stop, None if the completion finished for some other reason "
507
+ "including encountering the EOS token"),
508
+ )
509
+
510
+
511
+ class CompletionResponse(OpenAIBaseModel):
512
  id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
513
  object: str = "text_completion"
514
  created: int = Field(default_factory=lambda: int(time.time()))
 
517
  usage: UsageInfo
518
 
519
 
520
+ class CompletionResponseStreamChoice(OpenAIBaseModel):
521
  index: int
522
  text: str
523
+ logprobs: Optional[CompletionLogProbs] = None
524
+ finish_reason: Optional[str] = None
525
+ stop_reason: Optional[Union[int, str]] = Field(
526
+ default=None,
527
+ description=(
528
+ "The stop string or token id that caused the completion "
529
+ "to stop, None if the completion finished for some other reason "
530
+ "including encountering the EOS token"),
531
+ )
532
+
533
+
534
+ class CompletionStreamResponse(OpenAIBaseModel):
535
  id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
536
  object: str = "text_completion"
537
  created: int = Field(default_factory=lambda: int(time.time()))
 
540
  usage: Optional[UsageInfo] = Field(default=None)
541
 
542
 
543
+ class EmbeddingResponseData(BaseModel):
544
+ index: int
545
+ object: str = "embedding"
546
+ embedding: List[float]
547
+
548
+
549
+ class EmbeddingResponse(BaseModel):
550
+ id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}")
551
+ object: str = "list"
552
+ created: int = Field(default_factory=lambda: int(time.time()))
553
+ model: str
554
+ data: List[EmbeddingResponseData]
555
+ usage: UsageInfo
556
+
557
+
558
+ class FunctionCall(OpenAIBaseModel):
559
+ name: str
560
+ arguments: str
561
+
562
+
563
+ class ToolCall(OpenAIBaseModel):
564
+ id: str = Field(default_factory=lambda: f"chatcmpl-tool-{random_uuid()}")
565
+ type: Literal["function"] = "function"
566
+ function: FunctionCall
567
+
568
+
569
+ class ChatMessage(OpenAIBaseModel):
570
  role: str
571
  content: str
572
+ tool_calls: List[ToolCall] = Field(default_factory=list)
573
+
574
+
575
+ class ChatCompletionLogProb(OpenAIBaseModel):
576
+ token: str
577
+ logprob: float = -9999.0
578
+ bytes: Optional[List[int]] = None
579
+
580
+
581
+ class ChatCompletionLogProbsContent(ChatCompletionLogProb):
582
+ top_logprobs: List[ChatCompletionLogProb] = Field(default_factory=list)
583
+
584
+
585
+ class ChatCompletionLogProbs(OpenAIBaseModel):
586
+ content: Optional[List[ChatCompletionLogProbsContent]] = None
587
 
588
 
589
+ class ChatCompletionResponseChoice(OpenAIBaseModel):
590
  index: int
591
  message: ChatMessage
592
+ logprobs: Optional[ChatCompletionLogProbs] = None
593
+ finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
594
+ stop_reason: Optional[Union[int, str]] = None
595
 
596
 
597
+ class ChatCompletionResponse(OpenAIBaseModel):
598
  id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
599
+ object: Literal["chat.completion"] = "chat.completion"
600
  created: int = Field(default_factory=lambda: int(time.time()))
601
  model: str
602
  choices: List[ChatCompletionResponseChoice]
603
  usage: UsageInfo
604
 
605
 
606
+ class DeltaMessage(OpenAIBaseModel):
607
  role: Optional[str] = None
608
  content: Optional[str] = None
609
+ tool_calls: List[ToolCall] = Field(default_factory=list)
610
 
611
 
612
+ class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
613
  index: int
614
  delta: DeltaMessage
615
+ logprobs: Optional[ChatCompletionLogProbs] = None
616
+ finish_reason: Optional[Literal["stop", "length", "tool_calls"]] = None
617
+ stop_reason: Optional[Union[int, str]] = None
618
 
619
 
620
+ class ChatCompletionStreamResponse(OpenAIBaseModel):
621
  id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
622
+ object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
623
  created: int = Field(default_factory=lambda: int(time.time()))
624
  model: str
625
  choices: List[ChatCompletionResponseStreamChoice]
626
+ usage: Optional[UsageInfo] = Field(default=None)
627
+
628
+
629
+ class BatchRequestInput(OpenAIBaseModel):
630
+ """
631
+ The per-line object of the batch input file.
632
+
633
+ NOTE: Currently only the `/v1/chat/completions` endpoint is supported.
634
+ """
635
+
636
+ # A developer-provided per-request id that will be used to match outputs to
637
+ # inputs. Must be unique for each request in a batch.
638
+ custom_id: str
639
+
640
+ # The HTTP method to be used for the request. Currently only POST is
641
+ # supported.
642
+ method: str
643
+
644
+ # The OpenAI API relative URL to be used for the request. Currently
645
+ # /v1/chat/completions is supported.
646
+ url: str
647
+
648
+ # The parameters of the request.
649
+ body: Union[ChatCompletionRequest, ]
650
+
651
+
652
+ class BatchRequestOutput(OpenAIBaseModel):
653
+ """
654
+ The per-line object of the batch output and error files
655
+ """
656
+
657
+ id: str
658
+
659
+ # A developer-provided per-request id that will be used to match outputs to
660
+ # inputs.
661
+ custom_id: str
662
+
663
+ response: Optional[ChatCompletionResponse]
664
+
665
+ # For requests that failed with a non-HTTP error, this will contain more
666
+ # information on the cause of the failure.
667
+ error: Optional[Any]
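
As context for the new to_sampling_params() logic above, the snippet below is an editor's illustration (not code from the commit) of how the logit_bias field is turned into a logits processor: each bias is clamped to [-100, 100], as the OpenAI API specifies, and added to the corresponding token's logit before sampling.

```python
# Editor's sketch of the logit_bias handling built inside to_sampling_params().
from typing import Dict, List

import torch


def make_logit_bias_processor(logit_bias: Dict[str, float]):
    def logit_bias_logits_processor(token_ids: List[int],
                                    logits: torch.Tensor) -> torch.Tensor:
        for token_id, bias in logit_bias.items():
            bias = min(100, max(-100, bias))  # clamp per the OpenAI API spec
            logits[int(token_id)] += bias
        return logits

    return logit_bias_logits_processor


processor = make_logit_bias_processor({"42": -100.0, "7": 2.5})
logits = processor([], torch.zeros(64))
print(logits[42].item(), logits[7].item())  # -100.0 2.5
```
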
serving_completion.py CHANGED
@@ -1,290 +1,495 @@
 
1
  import time
 
 
 
 
 
 
2
  from fastapi import Request
3
- from typing import AsyncGenerator, AsyncIterator
4
- from vllm.logger import init_logger
5
- from vllm.utils import random_uuid
6
  from vllm.engine.async_llm_engine import AsyncLLMEngine
7
- from protocol import (
8
- CompletionRequest,
9
- CompletionResponse,
10
- CompletionResponseChoice,
11
- CompletionResponseStreamChoice,
12
- CompletionStreamResponse,
13
- LogProbs,
14
- UsageInfo,
15
- )
 
 
 
 
16
  from vllm.outputs import RequestOutput
17
- from serving_engine import OpenAIServing
 
18
 
19
  logger = init_logger(__name__)
20
 
21
 
22
- async def completion_stream_generator(
23
- request: CompletionRequest,
24
- result_generator: AsyncIterator[RequestOutput],
25
- echo_without_generation, create_logprobs_fn, request_id, created_time,
26
- model_name) -> AsyncGenerator[str, None]:
27
- previous_texts = [""] * request.n
28
- previous_num_tokens = [0] * request.n
29
- has_echoed = [False] * request.n
30
-
31
- async for res in result_generator:
32
- # TODO: handle client disconnect for streaming
33
- for output in res.outputs:
34
- i = output.index
35
- delta_text = output.text[len(previous_texts[i]):]
36
- token_ids = output.token_ids[previous_num_tokens[i]:]
37
- if request.logprobs is not None:
38
- top_logprobs = output.logprobs[previous_num_tokens[i]:]
39
- else:
40
- top_logprobs = None
41
- offsets = len(previous_texts[i])
42
- if request.echo and not has_echoed[i]:
43
- if not echo_without_generation:
44
- delta_text = res.prompt + delta_text
45
- token_ids = res.prompt_token_ids + token_ids
46
- if top_logprobs:
47
- top_logprobs = res.prompt_logprobs + top_logprobs
48
- else: # only just return the prompt
49
- delta_text = res.prompt
50
- token_ids = res.prompt_token_ids
51
- if top_logprobs:
52
- top_logprobs = res.prompt_logprobs
53
- has_echoed[i] = True
54
- if request.logprobs is not None:
55
- logprobs = create_logprobs_fn(
56
- token_ids=token_ids,
57
- top_logprobs=top_logprobs,
58
- num_output_top_logprobs=request.logprobs,
59
- initial_text_offset=offsets,
60
- )
61
- else:
62
- logprobs = None
63
- previous_texts[i] = output.text
64
- previous_num_tokens[i] = len(output.token_ids)
65
- finish_reason = output.finish_reason
66
- response_json = CompletionStreamResponse(
67
- id=request_id,
68
- created=created_time,
69
- model=model_name,
70
- choices=[
71
- CompletionResponseStreamChoice(
72
- index=i,
73
- text=delta_text,
74
- logprobs=logprobs,
75
- finish_reason=finish_reason,
76
- )
77
- ]).model_dump_json(exclude_unset=True)
78
- yield f"data: {response_json}\n\n"
79
-
80
- if output.finish_reason is not None:
81
- logprobs = LogProbs() if request.logprobs is not None else None
82
- prompt_tokens = len(res.prompt_token_ids)
83
- completion_tokens = len(output.token_ids)
84
- final_usage = UsageInfo(
85
- prompt_tokens=prompt_tokens,
86
- completion_tokens=completion_tokens,
87
- total_tokens=prompt_tokens + completion_tokens,
88
- )
89
- response_json = CompletionStreamResponse(
90
- id=request_id,
91
- created=created_time,
92
- model=model_name,
93
- choices=[
94
- CompletionResponseStreamChoice(
95
- index=i,
96
- text="",
97
- logprobs=logprobs,
98
- finish_reason=output.finish_reason,
99
- )
100
- ],
101
- usage=final_usage,
102
- ).model_dump_json(exclude_unset=True)
103
- yield f"data: {response_json}\n\n"
104
-
105
- yield "data: [DONE]\n\n"
106
-
107
-
108
- def parse_prompt_format(prompt) -> tuple[bool, list]:
109
- # get the prompt, openai supports the following
110
- # "a string, array of strings, array of tokens, or array of token arrays."
111
- prompt_is_tokens = False
112
- prompts = [prompt] # case 1: a string
113
- if isinstance(prompt, list):
114
- if len(prompt) == 0:
115
- raise ValueError("please provide at least one prompt")
116
- elif isinstance(prompt[0], str):
117
- prompt_is_tokens = False
118
- prompts = prompt # case 2: array of strings
119
- elif isinstance(prompt[0], int):
120
- prompt_is_tokens = True
121
- prompts = [prompt] # case 3: array of tokens
122
- elif isinstance(prompt[0], list) and isinstance(prompt[0][0], int):
123
- prompt_is_tokens = True
124
- prompts = prompt # case 4: array of token arrays
125
  else:
126
- raise ValueError(
127
- "prompt must be a string, array of strings, array of tokens, or array of token arrays"
128
- )
129
- return prompt_is_tokens, prompts
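
The parse_prompt_format helper removed above normalizes the four prompt shapes the completions API accepts (a string, a list of strings, a list of token ids, or a list of token-id lists). The standalone sketch below is an editor's illustration mirroring that logic, not code from the commit:

```python
# Editor's sketch of the prompt normalization performed by parse_prompt_format.
from typing import List, Tuple, Union

Prompt = Union[str, List[str], List[int], List[List[int]]]


def normalize_prompt(prompt: Prompt) -> Tuple[bool, list]:
    prompt_is_tokens = False
    prompts = [prompt]  # case 1: a single string
    if isinstance(prompt, list):
        if len(prompt) == 0:
            raise ValueError("please provide at least one prompt")
        elif isinstance(prompt[0], str):
            prompts = prompt  # case 2: a list of strings
        elif isinstance(prompt[0], int):
            prompt_is_tokens = True
            prompts = [prompt]  # case 3: a single list of token ids
        elif isinstance(prompt[0], list) and isinstance(prompt[0][0], int):
            prompt_is_tokens = True
            prompts = prompt  # case 4: a list of token-id lists
        else:
            raise ValueError("prompt must be a string, array of strings, "
                             "array of tokens, or array of token arrays")
    return prompt_is_tokens, prompts


print(normalize_prompt("Hello"))           # (False, ['Hello'])
print(normalize_prompt([[1, 2, 3], [4]]))  # (True, [[1, 2, 3], [4]])
```
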
130
-
131
-
132
- def request_output_to_completion_response(final_res: RequestOutput, request,
133
- echo_without_generation,
134
- create_logprobs_fn, request_id,
135
- created_time,
136
- model_name) -> CompletionResponse:
137
- assert final_res is not None
138
- choices = []
139
- prompt_token_ids = final_res.prompt_token_ids
140
- prompt_logprobs = final_res.prompt_logprobs
141
- prompt_text = final_res.prompt
142
- for output in final_res.outputs:
143
- if request.logprobs is not None:
144
- if not echo_without_generation:
145
- token_ids = output.token_ids
146
- top_logprobs = output.logprobs
147
- if request.echo:
148
- token_ids = prompt_token_ids + token_ids
149
- top_logprobs = prompt_logprobs + top_logprobs
150
  else:
151
- token_ids = prompt_token_ids
152
- top_logprobs = prompt_logprobs
153
- logprobs = create_logprobs_fn(
154
- token_ids=token_ids,
155
- top_logprobs=top_logprobs,
156
- num_output_top_logprobs=request.logprobs,
157
- )
158
- else:
159
- logprobs = None
160
- if not echo_without_generation:
161
- output_text = output.text
162
- if request.echo:
163
- output_text = prompt_text + output_text
164
- else:
165
- output_text = prompt_text
166
- choice_data = CompletionResponseChoice(
167
- index=output.index,
168
- text=output_text,
169
- logprobs=logprobs,
170
- finish_reason=output.finish_reason,
171
- )
172
- choices.append(choice_data)
173
 
174
- num_prompt_tokens = len(final_res.prompt_token_ids)
175
- num_generated_tokens = sum(
176
- len(output.token_ids) for output in final_res.outputs)
177
- usage = UsageInfo(
178
- prompt_tokens=num_prompt_tokens,
179
- completion_tokens=num_generated_tokens,
180
- total_tokens=num_prompt_tokens + num_generated_tokens,
181
- )
182
 
183
- return CompletionResponse(
184
- id=request_id,
185
- created=created_time,
186
- model=model_name,
187
- choices=choices,
188
- usage=usage,
189
- )
190
 
 
 
 
 
 
 
191
 
192
- class OpenAIServingCompletion(OpenAIServing):
 
 
 
 
193
 
194
- def __init__(self, engine: AsyncLLMEngine, served_model: str):
195
- super().__init__(engine=engine, served_model=served_model)
196
 
197
- async def create_completion(self, request: CompletionRequest,
198
- raw_request: Request):
 
 
 
 
199
  """Completion API similar to OpenAI's API.
200
 
201
- See https://platform.openai.com/docs/api-reference/completions/create
202
- for the API specification. This API mimics the OpenAI Completion API.
 
203
 
204
- NOTE: Currently we do not support the following features:
205
- - suffix (the language models we currently support do not support
206
- suffix)
207
- - logit_bias (to be supported by vLLM engine)
208
  """
209
  error_check_ret = await self._check_model(request)
210
  if error_check_ret is not None:
211
  return error_check_ret
212
 
213
- # OpenAI API supports echoing the prompt when max_tokens is 0.
214
- echo_without_generation = request.echo and request.max_tokens == 0
215
-
216
- # Return error for unsupported features.
217
- if request.suffix is not None:
218
- return self.create_error_response(
219
- "suffix is not currently supported")
220
- if request.logit_bias is not None and len(request.logit_bias) > 0:
221
- return self.create_error_response(
222
- "logit_bias is not currently supported")
223
-
224
- model_name = request.model
225
- request_id = f"cmpl-{random_uuid()}"
226
- created_time = int(time.monotonic())
227
-
228
- # Schedule the request and get the result generator.
229
  try:
230
- sampling_params = request.to_sampling_params()
231
 
232
- prompt_is_tokens, prompts = parse_prompt_format(request.prompt)
 
233
 
234
- if len(prompts) > 1:
235
- raise ValueError(
236
- "Batching in completion API is not supported.")
237
- prompt = prompts[0]
238
 
239
- if prompt_is_tokens:
240
- input_ids = self._validate_prompt_and_tokenize(
241
- request, prompt_ids=prompt)
242
- else:
243
- input_ids = self._validate_prompt_and_tokenize(request,
244
- prompt=prompt)
 
 
245
 
246
- result_generator = self.engine.generate(None,
247
- sampling_params,
248
- request_id,
249
- prompt_token_ids=input_ids)
250
  except ValueError as e:
251
  return self.create_error_response(str(e))
252
 
253
- # Similar to the OpenAI API, when n != best_of, we do not stream the
254
- # results. In addition, we do not stream the results when use beam search.
255
- stream = (request.stream
256
- and (request.best_of is None or request.n == request.best_of)
257
- and not request.use_beam_search)
258
-
 
 
 
259
  # Streaming response
260
- if stream:
261
- return completion_stream_generator(request, result_generator,
262
- echo_without_generation,
263
- self._create_logprobs,
264
- request_id, created_time,
265
- model_name)
266
-
267
- # Non-streaming response
268
- final_res: RequestOutput = None
269
  async for res in result_generator:
270
- if await raw_request.is_disconnected():
271
  # Abort the request if the client disconnects.
272
  await self.engine.abort(request_id)
273
  return self.create_error_response("Client disconnected")
274
  final_res = res
275
- response = request_output_to_completion_response(
276
- final_res, request, echo_without_generation, self._create_logprobs,
277
- request_id, created_time, model_name)
278
 
279
- # When user requests streaming but we don't stream, we still need to
280
- # return a streaming response with a single event.
281
- if request.stream:
282
- response_json = response.model_dump_json()
 
 
283
 
284
- async def fake_stream_generator() -> AsyncGenerator[str, None]:
285
- yield f"data: {response_json}\n\n"
286
- yield "data: [DONE]\n\n"
 
 
 
 
 
287
 
288
- return fake_stream_generator()
289
 
290
- return response
1
+ import codecs
2
  import time
3
+ from dataclasses import dataclass
4
+ from typing import (AsyncGenerator, AsyncIterator, Dict, Iterable, List,
5
+ Optional)
6
+ from typing import Sequence as GenericSequence
7
+ from typing import TypedDict, Union, cast, final
8
+
9
  from fastapi import Request
10
+ from openai.types.chat import ChatCompletionContentPartTextParam
11
+
12
+ from vllm.config import ModelConfig
13
  from vllm.engine.async_llm_engine import AsyncLLMEngine
14
+ from vllm.entrypoints.openai.protocol import (
15
+ ChatCompletionContentPartParam, ChatCompletionLogProb,
16
+ ChatCompletionLogProbs, ChatCompletionLogProbsContent,
17
+ ChatCompletionMessageParam, ChatCompletionNamedToolChoiceParam,
18
+ ChatCompletionRequest, ChatCompletionResponse,
19
+ ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
20
+ ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
21
+ FunctionCall, ToolCall, UsageInfo)
22
+ from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
23
+ OpenAIServing)
24
+ from vllm.logger import init_logger
25
+ from vllm.model_executor.guided_decoding import (
26
+ get_guided_decoding_logits_processor)
27
  from vllm.outputs import RequestOutput
28
+ from vllm.sequence import Logprob
29
+ from vllm.utils import random_uuid
30
 
31
  logger = init_logger(__name__)
32
 
33
 
34
+ @final # So that it should be compatible with Dict[str, str]
35
+ class ConversationMessage(TypedDict):
36
+ role: str
37
+ content: str
38
+
39
+
40
+ @dataclass(frozen=True)
41
+ class ChatMessageParseResult:
42
+ messages: List[ConversationMessage]
43
+
44
+
45
+ class OpenAIServingChat(OpenAIServing):
46
+
47
+ def __init__(self,
48
+ engine: AsyncLLMEngine,
49
+ model_config: ModelConfig,
50
+ served_model_names: List[str],
51
+ response_role: str,
52
+ lora_modules: Optional[List[LoRAModulePath]] = None,
53
+ chat_template: Optional[str] = None):
54
+ super().__init__(engine=engine,
55
+ model_config=model_config,
56
+ served_model_names=served_model_names,
57
+ lora_modules=lora_modules)
58
+
59
+ self.response_role = response_role
60
+ self._load_chat_template(chat_template)
61
+
62
+ def _load_chat_template(self, chat_template: Optional[str]):
63
+ tokenizer = self.tokenizer
64
+
65
+ if chat_template is not None:
66
+ try:
67
+ with open(chat_template, "r") as f:
68
+ tokenizer.chat_template = f.read()
69
+ except OSError as e:
70
+ JINJA_CHARS = "{}\n"
71
+ if not any(c in chat_template for c in JINJA_CHARS):
72
+ msg = (f"The supplied chat template ({chat_template}) "
73
+ f"looks like a file path, but it failed to be "
74
+ f"opened. Reason: {e}")
75
+ raise ValueError(msg) from e
76
+
77
+ # If opening the file fails, treat the argument as the template
78
+ # itself and decode it so escape sequences are interpreted correctly
79
+ tokenizer.chat_template = codecs.decode(
80
+ chat_template, "unicode_escape")
81
+
82
+ logger.info("Using supplied chat template:\n%s",
83
+ tokenizer.chat_template)
84
+ elif tokenizer.chat_template is not None:
85
+ logger.info("Using default chat template:\n%s",
86
+ tokenizer.chat_template)
87
  else:
88
+ logger.warning(
89
+ "No chat template provided. Chat API will not work.")
90
+
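
For reference, the fallback branch above means that when --chat-template is not a readable file, the string itself is used as the template after unicode-escape decoding. A minimal standalone sketch of that behaviour (the template text is only an example):

import codecs

# An inline Jinja template passed on the command line, with literal "\n" escapes.
inline_template = ("{% for m in messages %}"
                   "{{ m['role'] }}: {{ m['content'] }}\\n"
                   "{% endfor %}")

# Mirrors the fallback in _load_chat_template: unicode_escape turns the literal
# backslash-n into a real newline before the template is handed to the tokenizer.
decoded = codecs.decode(inline_template, "unicode_escape")
assert "\n" in decoded
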
91
+ def _parse_chat_message_content_parts(
92
+ self,
93
+ role: str,
94
+ parts: Iterable[ChatCompletionContentPartParam],
95
+ ) -> ChatMessageParseResult:
96
+ texts: List[str] = []
97
+
98
+ for _, part in enumerate(parts):
99
+ part_type = part["type"]
100
+ if part_type == "text":
101
+ text = cast(ChatCompletionContentPartTextParam, part)["text"]
102
+
103
+ texts.append(text)
104
  else:
105
+ raise NotImplementedError(f"Unknown part type: {part_type}")
106
 
107
+ messages = [ConversationMessage(role=role, content="\n".join(texts))]
108
 
109
+ return ChatMessageParseResult(messages=messages)
110
 
111
+ def _parse_chat_message_content(
112
+ self,
113
+ message: ChatCompletionMessageParam,
114
+ ) -> ChatMessageParseResult:
115
+ role = message["role"]
116
+ content = message.get("content")
117
 
118
+ if content is None:
119
+ return ChatMessageParseResult(messages=[])
120
+ if isinstance(content, str):
121
+ messages = [ConversationMessage(role=role, content=content)]
122
+ return ChatMessageParseResult(messages=messages)
123
 
124
+ return self._parse_chat_message_content_parts(role, content)
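
The two code paths above accept either plain-string content or a list of typed parts, of which only "text" parts are handled; anything else raises NotImplementedError. The two request shapes below are therefore equivalent once parsed (values are illustrative):

# Plain string content -> one ConversationMessage
msg_plain = {"role": "user", "content": "What is the capital of France?"}

# List of content parts -> text parts are joined with "\n" by
# _parse_chat_message_content_parts
msg_parts = {
    "role": "user",
    "content": [
        {"type": "text", "text": "What is the capital"},
        {"type": "text", "text": "of France?"},
    ],
}
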
 
125
 
126
+ async def create_chat_completion(
127
+ self,
128
+ request: ChatCompletionRequest,
129
+ raw_request: Optional[Request] = None
130
+ ) -> Union[ErrorResponse, AsyncGenerator[str, None],
131
+ ChatCompletionResponse]:
132
  """Completion API similar to OpenAI's API.
133
 
134
+ See https://platform.openai.com/docs/api-reference/chat/create
135
+ for the API specification. This API mimics the OpenAI
136
+ ChatCompletion API.
137
 
138
+ NOTE: Currently we do not support the following feature:
139
+ - function_call (Users should implement this by themselves)
 
  """
141
  error_check_ret = await self._check_model(request)
142
  if error_check_ret is not None:
143
  return error_check_ret
144
 
 
145
  try:
146
+ conversation: List[ConversationMessage] = []
147
 
148
+ for msg in request.messages:
149
+ parsed_msg = self._parse_chat_message_content(msg)
150
 
151
+ conversation.extend(parsed_msg.messages)
152
 
153
+ prompt = self.tokenizer.apply_chat_template(
154
+ conversation=conversation,
155
+ tokenize=False,
156
+ add_generation_prompt=request.add_generation_prompt,
157
+ )
158
+ except Exception as e:
159
+ logger.error("Error in applying chat template from request: %s", e)
160
+ return self.create_error_response(str(e))
161
 
162
+ request_id = f"cmpl-{random_uuid()}"
163
+ try:
164
+ # Tokenize/detokenize depending on prompt format (string/token list)
165
+ prompt_ids, prompt_text = self._validate_prompt_and_tokenize(
166
+ request,
167
+ prompt=prompt,
168
+ add_special_tokens=request.add_special_tokens)
169
+ sampling_params = request.to_sampling_params()
170
+ lora_request = self._maybe_get_lora(request)
171
+ decoding_config = await self.engine.get_decoding_config()
172
+ guided_decoding_backend = request.guided_decoding_backend \
173
+ or decoding_config.guided_decoding_backend
174
+ guided_decode_logits_processor = (
175
+ await get_guided_decoding_logits_processor(
176
+ guided_decoding_backend, request, await
177
+ self.engine.get_tokenizer()))
178
+ if guided_decode_logits_processor:
179
+ if sampling_params.logits_processors is None:
180
+ sampling_params.logits_processors = []
181
+ sampling_params.logits_processors.append(
182
+ guided_decode_logits_processor)
183
  except ValueError as e:
184
  return self.create_error_response(str(e))
185
 
186
+ result_generator = self.engine.generate(
187
+ {
188
+ "prompt": prompt_text,
189
+ "prompt_token_ids": prompt_ids
190
+ },
191
+ sampling_params,
192
+ request_id,
193
+ lora_request,
194
+ )
195
  # Streaming response
196
+ if request.stream:
197
+ return self.chat_completion_stream_generator(
198
+ request, result_generator, request_id, conversation)
199
+ else:
200
+ try:
201
+ return await self.chat_completion_full_generator(
202
+ request, raw_request, result_generator, request_id,
203
+ conversation)
204
+ except ValueError as e:
205
+ # TODO: Use a vllm-specific Validation Error
206
+ return self.create_error_response(str(e))
207
+
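
Since create_chat_completion follows the OpenAI Chat Completions API, it can be exercised with the official openai client. A minimal sketch, assuming the server is reachable at http://localhost:8000 and serves a model registered as "my-model" (both placeholders):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Non-streaming request: served by chat_completion_full_generator.
resp = client.chat.completions.create(
    model="my-model",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(resp.choices[0].message.content)
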
208
+ def get_chat_request_role(self, request: ChatCompletionRequest) -> str:
209
+ if request.add_generation_prompt:
210
+ return self.response_role
211
+ else:
212
+ return request.messages[-1]["role"]
213
+
214
+ async def chat_completion_stream_generator(
215
+ self, request: ChatCompletionRequest,
216
+ result_generator: AsyncIterator[RequestOutput], request_id: str,
217
+ conversation: List[ConversationMessage]
218
+ ) -> AsyncGenerator[str, None]:
219
+ model_name = self.served_model_names[0]
220
+ created_time = int(time.time())
221
+ chunk_object_type = "chat.completion.chunk"
222
+ first_iteration = True
223
+
224
+ # Send response for each token for each request.n (index)
225
+ assert request.n is not None
226
+ previous_texts = [""] * request.n
227
+ previous_num_tokens = [0] * request.n
228
+ finish_reason_sent = [False] * request.n
229
+ try:
230
+ async for res in result_generator:
231
+ # We need to do it here, because if there are exceptions in
232
+ # the result_generator, it needs to be sent as the FIRST
233
+ # response (by the try...except).
234
+ if first_iteration:
235
+ # Send first response for each request.n (index) with
236
+ # the role
237
+ role = self.get_chat_request_role(request)
238
+ for i in range(request.n):
239
+ choice_data = ChatCompletionResponseStreamChoice(
240
+ index=i,
241
+ delta=DeltaMessage(role=role),
242
+ logprobs=None,
243
+ finish_reason=None)
244
+ chunk = ChatCompletionStreamResponse(
245
+ id=request_id,
246
+ object=chunk_object_type,
247
+ created=created_time,
248
+ choices=[choice_data],
249
+ model=model_name)
250
+ data = chunk.model_dump_json(exclude_unset=True)
251
+ yield f"data: {data}\n\n"
252
+
253
+ # Send response to echo the input portion of the
254
+ # last message
255
+ if request.echo:
256
+ last_msg_content = ""
257
+ if conversation and conversation[-1].get(
258
+ "content") and conversation[-1].get(
259
+ "role") == role:
260
+ last_msg_content = conversation[-1]["content"]
261
+
262
+ if last_msg_content:
263
+ for i in range(request.n):
264
+ choice_data = (
265
+ ChatCompletionResponseStreamChoice(
266
+ index=i,
267
+ delta=DeltaMessage(
268
+ content=last_msg_content),
269
+ finish_reason=None))
270
+ chunk = ChatCompletionStreamResponse(
271
+ id=request_id,
272
+ object=chunk_object_type,
273
+ created=created_time,
274
+ choices=[choice_data],
275
+ logprobs=None,
276
+ model=model_name)
277
+ data = chunk.model_dump_json(
278
+ exclude_unset=True)
279
+ yield f"data: {data}\n\n"
280
+ first_iteration = False
281
+
282
+ for output in res.outputs:
283
+ i = output.index
284
+
285
+ if finish_reason_sent[i]:
286
+ continue
287
+
288
+ delta_token_ids = output.token_ids[previous_num_tokens[i]:]
289
+ top_logprobs = output.logprobs[
290
+ previous_num_tokens[i]:] if output.logprobs else None
291
+
292
+ if request.logprobs:
293
+ logprobs = self._create_chat_logprobs(
294
+ token_ids=delta_token_ids,
295
+ top_logprobs=top_logprobs,
296
+ num_output_top_logprobs=request.top_logprobs,
297
+ )
298
+ else:
299
+ logprobs = None
300
+
301
+ delta_text = output.text[len(previous_texts[i]):]
302
+ previous_texts[i] = output.text
303
+ previous_num_tokens[i] = len(output.token_ids)
304
+
305
+ if request.tool_choice and type(
306
+ request.tool_choice
307
+ ) is ChatCompletionNamedToolChoiceParam:
308
+ delta_message = DeltaMessage(tool_calls=[
309
+ ToolCall(function=FunctionCall(
310
+ name=request.tool_choice.function.name,
311
+ arguments=delta_text))
312
+ ])
313
+ else:
314
+ delta_message = DeltaMessage(content=delta_text)
315
+
316
+ if output.finish_reason is None:
317
+ # Send token-by-token response for each request.n
318
+
319
+ choice_data = ChatCompletionResponseStreamChoice(
320
+ index=i,
321
+ delta=delta_message,
322
+ logprobs=logprobs,
323
+ finish_reason=None)
324
+ chunk = ChatCompletionStreamResponse(
325
+ id=request_id,
326
+ object=chunk_object_type,
327
+ created=created_time,
328
+ choices=[choice_data],
329
+ model=model_name)
330
+ data = chunk.model_dump_json(exclude_unset=True)
331
+ yield f"data: {data}\n\n"
332
+ else:
333
+ # Send the finish response for each request.n only once
334
+ prompt_tokens = len(res.prompt_token_ids)
335
+ final_usage = UsageInfo(
336
+ prompt_tokens=prompt_tokens,
337
+ completion_tokens=previous_num_tokens[i],
338
+ total_tokens=prompt_tokens +
339
+ previous_num_tokens[i],
340
+ )
341
+ choice_data = ChatCompletionResponseStreamChoice(
342
+ index=i,
343
+ delta=delta_message,
344
+ logprobs=logprobs,
345
+ finish_reason=output.finish_reason,
346
+ stop_reason=output.stop_reason)
347
+ chunk = ChatCompletionStreamResponse(
348
+ id=request_id,
349
+ object=chunk_object_type,
350
+ created=created_time,
351
+ choices=[choice_data],
352
+ model=model_name)
353
+ if final_usage is not None:
354
+ chunk.usage = final_usage
355
+ data = chunk.model_dump_json(exclude_unset=True,
356
+ exclude_none=True)
357
+ yield f"data: {data}\n\n"
358
+ finish_reason_sent[i] = True
359
+ except ValueError as e:
360
+ # TODO: Use a vllm-specific Validation Error
361
+ data = self.create_streaming_error_response(str(e))
362
+ yield f"data: {data}\n\n"
363
+ # Send the final done message after all request.n choices are finished
364
+ yield "data: [DONE]\n\n"
365
+
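
When request.stream is set, each yield above is one Server-Sent Events frame and the stream is closed by a literal [DONE] sentinel. A rough sketch of a raw consumer using the requests package (URL and model name are placeholders):

import json
import requests

with requests.post(
        "http://localhost:8000/v1/chat/completions",
        json={
            "model": "my-model",
            "messages": [{"role": "user", "content": "Hello!"}],
            "stream": True,
        },
        stream=True) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload == "[DONE]":  # sentinel emitted after the last chunk
            break
        chunk = json.loads(payload)
        delta = chunk["choices"][0]["delta"]
        print(delta.get("content", ""), end="")
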
366
+ async def chat_completion_full_generator(
367
+ self, request: ChatCompletionRequest, raw_request: Optional[Request],
368
+ result_generator: AsyncIterator[RequestOutput], request_id: str,
369
+ conversation: List[ConversationMessage]
370
+ ) -> Union[ErrorResponse, ChatCompletionResponse]:
371
+
372
+ model_name = self.served_model_names[0]
373
+ created_time = int(time.time())
374
+ final_res: Optional[RequestOutput] = None
375
+
376
  async for res in result_generator:
377
+ if raw_request is not None and await raw_request.is_disconnected():
378
  # Abort the request if the client disconnects.
379
  await self.engine.abort(request_id)
380
  return self.create_error_response("Client disconnected")
381
  final_res = res
382
+ assert final_res is not None
383
 
384
+ choices = []
385
+
386
+ role = self.get_chat_request_role(request)
387
+ for output in final_res.outputs:
388
+ token_ids = output.token_ids
389
+ top_logprobs = output.logprobs
390
 
391
+ if request.logprobs:
392
+ logprobs = self._create_chat_logprobs(
393
+ token_ids=token_ids,
394
+ top_logprobs=top_logprobs,
395
+ num_output_top_logprobs=request.top_logprobs,
396
+ )
397
+ else:
398
+ logprobs = None
399
 
400
+ if request.tool_choice and type(
401
+ request.tool_choice) is ChatCompletionNamedToolChoiceParam:
402
+ message = ChatMessage(
403
+ role=role,
404
+ content="",
405
+ tool_calls=[
406
+ ToolCall(function=FunctionCall(
407
+ name=request.tool_choice.function.name,
408
+ arguments=output.text))
409
+ ])
410
+ elif not request.tool_choice or request.tool_choice == "none":
411
+ message = ChatMessage(role=role, content=output.text)
412
+
413
+ choice_data = ChatCompletionResponseChoice(
414
+ index=output.index,
415
+ message=message,
416
+ logprobs=logprobs,
417
+ finish_reason=output.finish_reason,
418
+ stop_reason=output.stop_reason)
419
+ choices.append(choice_data)
420
+
421
+ if request.echo:
422
+ last_msg_content = ""
423
+ if conversation and conversation[-1].get(
424
+ "content") and conversation[-1].get("role") == role:
425
+ last_msg_content = conversation[-1]["content"]
426
+
427
+ for choice in choices:
428
+ full_message = last_msg_content + choice.message.content
429
+ choice.message.content = full_message
430
+
431
+ num_prompt_tokens = len(final_res.prompt_token_ids)
432
+ num_generated_tokens = sum(
433
+ len(output.token_ids) for output in final_res.outputs)
434
+ usage = UsageInfo(
435
+ prompt_tokens=num_prompt_tokens,
436
+ completion_tokens=num_generated_tokens,
437
+ total_tokens=num_prompt_tokens + num_generated_tokens,
438
+ )
439
+ response = ChatCompletionResponse(
440
+ id=request_id,
441
+ created=created_time,
442
+ model=model_name,
443
+ choices=choices,
444
+ usage=usage,
445
+ )
446
 
447
+ return response
448
+
449
+ def _get_top_logprobs(
450
+ self, logprobs: Dict[int, Logprob],
451
+ top_logprobs: Optional[int]) -> List[ChatCompletionLogProb]:
452
+ return [
453
+ ChatCompletionLogProb(
454
+ token=self._get_decoded_token(p[1], p[0]),
455
+ logprob=max(p[1].logprob, -9999.0),
456
+ bytes=list(
457
+ self._get_decoded_token(p[1],
458
+ p[0]).encode("utf-8",
459
+ errors="replace")))
460
+ for i, p in enumerate(logprobs.items())
461
+ if top_logprobs and i < top_logprobs
462
+ ]
463
+
464
+ def _create_chat_logprobs(
465
+ self,
466
+ token_ids: GenericSequence[int],
467
+ top_logprobs: GenericSequence[Optional[Dict[int, Logprob]]],
468
+ num_output_top_logprobs: Optional[int] = None,
469
+ ) -> ChatCompletionLogProbs:
470
+ """Create OpenAI-style logprobs."""
471
+
472
+ logprobs_content = []
473
+
474
+ for i, token_id in enumerate(token_ids):
475
+ step_top_logprobs = top_logprobs[i]
476
+ if step_top_logprobs is None:
477
+ logprobs_content.append(
478
+ ChatCompletionLogProbsContent(
479
+ token=self.tokenizer.decode(token_id),
480
+ bytes=list(
481
+ self.tokenizer.decode(token_id).encode(
482
+ "utf-8", errors="replace"))))
483
+ else:
484
+ logprobs_content.append(
485
+ ChatCompletionLogProbsContent(
486
+ token=step_top_logprobs[token_id].decoded_token,
487
+ logprob=max(step_top_logprobs[token_id].logprob,
488
+ -9999.0),
489
+ bytes=list(
490
+ step_top_logprobs[token_id].decoded_token.encode(
491
+ "utf-8", errors="replace")),
492
+ top_logprobs=self._get_top_logprobs(
493
+ step_top_logprobs, num_output_top_logprobs)))
494
+
495
+ return ChatCompletionLogProbs(content=logprobs_content)
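
The helpers above populate the OpenAI-style logprobs objects (token, logprob, bytes, and optional top_logprobs), with log-probabilities clamped at -9999.0. A short sketch of requesting them through the openai client (server URL and model name are placeholders):

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="my-model",
    messages=[{"role": "user", "content": "Hi"}],
    logprobs=True,   # populates ChatCompletionLogProbs via _create_chat_logprobs
    top_logprobs=2,  # forwarded as num_output_top_logprobs
)
for item in resp.choices[0].logprobs.content:
    print(item.token, item.logprob, [alt.token for alt in item.top_logprobs])
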
serving_engine.py CHANGED
@@ -1,92 +1,81 @@
1
- import asyncio
 
2
  from http import HTTPStatus
3
- from typing import Dict, List, Optional, Union
4
- from vllm.logger import init_logger
5
- from vllm.transformers_utils.tokenizer import get_tokenizer
6
  from vllm.engine.async_llm_engine import AsyncLLMEngine
7
- from protocol import (CompletionRequest,
8
- ChatCompletionRequest,
9
- ErrorResponse, LogProbs,
10
  ModelCard, ModelList,
11
  ModelPermission)
12
 
13
  logger = init_logger(__name__)
14
 
15
 
16
- class OpenAIServing:
17
 
18
- def __init__(self, engine: AsyncLLMEngine, served_model: str):
19
- self.engine = engine
20
- self.served_model = served_model
21
-
22
- self.max_model_len = 0
23
- self.tokenizer = None
24
 
25
- try:
26
- event_loop = asyncio.get_running_loop()
27
- except RuntimeError:
28
- event_loop = None
29
 
30
- if event_loop is not None and event_loop.is_running(
31
- ): # If the current is instanced by Ray Serve, there is already a running event loop
32
- event_loop.create_task(self._post_init())
33
- else: # When using single vLLM without engine_use_ray
34
- asyncio.run(self._post_init())
35
 
36
- async def _post_init(self):
37
- engine_model_config = await self.engine.get_model_config()
38
- self.max_model_len = engine_model_config.max_model_len
39
 
40
  # A separate tokenizer to map token IDs to strings.
41
  self.tokenizer = get_tokenizer(
42
- engine_model_config.tokenizer,
43
- tokenizer_mode=engine_model_config.tokenizer_mode,
44
- trust_remote_code=engine_model_config.trust_remote_code)
45
 
46
  async def show_available_models(self) -> ModelList:
47
  """Show available models. Right now we only have one model."""
48
  model_cards = [
49
- ModelCard(id=self.served_model,
50
- root=self.served_model,
51
  permission=[ModelPermission()])
 
52
  ]
 
53
  return ModelList(data=model_cards)
54
 
55
- def _create_logprobs(
56
- self,
57
- token_ids: List[int],
58
- top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None,
59
- num_output_top_logprobs: Optional[int] = None,
60
- initial_text_offset: int = 0,
61
- ) -> LogProbs:
62
- """Create OpenAI-style logprobs."""
63
- logprobs = LogProbs()
64
- last_token_len = 0
65
- if num_output_top_logprobs:
66
- logprobs.top_logprobs = []
67
- for i, token_id in enumerate(token_ids):
68
- step_top_logprobs = top_logprobs[i]
69
- if step_top_logprobs is not None:
70
- token_logprob = step_top_logprobs[token_id]
71
- else:
72
- token_logprob = None
73
- token = self.tokenizer.convert_ids_to_tokens(token_id)
74
- logprobs.tokens.append(token)
75
- logprobs.token_logprobs.append(token_logprob)
76
- if len(logprobs.text_offset) == 0:
77
- logprobs.text_offset.append(initial_text_offset)
78
- else:
79
- logprobs.text_offset.append(logprobs.text_offset[-1] +
80
- last_token_len)
81
- last_token_len = len(token)
82
-
83
- if num_output_top_logprobs:
84
- logprobs.top_logprobs.append({
85
- self.tokenizer.convert_ids_to_tokens(i): p
86
- for i, p in step_top_logprobs.items()
87
- } if step_top_logprobs else None)
88
- return logprobs
89
-
90
  def create_error_response(
91
  self,
92
  message: str,
@@ -96,38 +85,116 @@ class OpenAIServing:
96
  type=err_type,
97
  code=status_code.value)
98
 
99
- async def _check_model(self, request) -> Optional[ErrorResponse]:
100
- if request.model == self.served_model:
101
- return
102
  return self.create_error_response(
103
  message=f"The model `{request.model}` does not exist.",
104
  err_type="NotFoundError",
105
  status_code=HTTPStatus.NOT_FOUND)
106
 
107
  def _validate_prompt_and_tokenize(
108
  self,
109
- request: Union[ChatCompletionRequest, CompletionRequest],
 
110
  prompt: Optional[str] = None,
111
- prompt_ids: Optional[List[int]] = None) -> List[int]:
112
  if not (prompt or prompt_ids):
113
  raise ValueError("Either prompt or prompt_ids should be provided.")
114
  if (prompt and prompt_ids):
115
  raise ValueError(
116
  "Only one of prompt or prompt_ids should be provided.")
117
 
118
- input_ids = prompt_ids if prompt_ids is not None else self.tokenizer(
119
- prompt).input_ids
120
  token_num = len(input_ids)
121
 
122
  if request.max_tokens is None:
123
  request.max_tokens = self.max_model_len - token_num
124
 
125
  if token_num + request.max_tokens > self.max_model_len:
126
  raise ValueError(
127
- f"This model's maximum context length is {self.max_model_len} tokens. "
128
- f"However, you requested {request.max_tokens + token_num} tokens "
 
129
  f"({token_num} in the messages, "
130
  f"{request.max_tokens} in the completion). "
131
  f"Please reduce the length of the messages or completion.", )
132
  else:
133
- return input_ids
1
+ import json
2
+ from dataclasses import dataclass
3
  from http import HTTPStatus
4
+ from typing import Any, Dict, List, Optional, Tuple, Union
5
+
6
+ from pydantic import Field
7
+ from typing_extensions import Annotated
8
+
9
+ from vllm.config import ModelConfig
10
  from vllm.engine.async_llm_engine import AsyncLLMEngine
11
+ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
12
+ CompletionRequest,
13
+ EmbeddingRequest, ErrorResponse,
14
  ModelCard, ModelList,
15
  ModelPermission)
16
+ from vllm.logger import init_logger
17
+ from vllm.lora.request import LoRARequest
18
+ from vllm.sequence import Logprob
19
+ from vllm.transformers_utils.tokenizer import get_tokenizer
20
 
21
  logger = init_logger(__name__)
22
 
23
 
24
+ @dataclass
25
+ class LoRAModulePath:
26
+ name: str
27
+ local_path: str
28
 
 
29
 
30
+ class OpenAIServing:
31
 
32
+ def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig,
33
+ served_model_names: List[str],
34
+ lora_modules: Optional[List[LoRAModulePath]]):
35
+ super().__init__()
 
36
 
37
+ self.engine = engine
38
+ self.max_model_len = model_config.max_model_len
 
39
 
40
  # A separate tokenizer to map token IDs to strings.
41
  self.tokenizer = get_tokenizer(
42
+ model_config.tokenizer,
43
+ tokenizer_mode=model_config.tokenizer_mode,
44
+ tokenizer_revision=model_config.tokenizer_revision,
45
+ trust_remote_code=model_config.trust_remote_code,
46
+ truncation_side="left")
47
+
48
+ self.served_model_names = served_model_names
49
+
50
+ if lora_modules is None:
51
+ self.lora_requests = []
52
+ else:
53
+ self.lora_requests = [
54
+ LoRARequest(
55
+ lora_name=lora.name,
56
+ lora_int_id=i,
57
+ lora_local_path=lora.local_path,
58
+ ) for i, lora in enumerate(lora_modules, start=1)
59
+ ]
60
 
61
  async def show_available_models(self) -> ModelList:
62
  """Show available models. Right now we only have one model."""
63
  model_cards = [
64
+ ModelCard(id=served_model_name,
65
+ max_model_len=self.max_model_len,
66
+ root=self.served_model_names[0],
67
+ permission=[ModelPermission()])
68
+ for served_model_name in self.served_model_names
69
+ ]
70
+ lora_cards = [
71
+ ModelCard(id=lora.lora_name,
72
+ root=self.served_model_names[0],
73
  permission=[ModelPermission()])
74
+ for lora in self.lora_requests
75
  ]
76
+ model_cards.extend(lora_cards)
77
  return ModelList(data=model_cards)
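
With the changes above, /v1/models now returns one card per served model name plus one per LoRA adapter registered through lora_modules. A quick way to inspect the list (the URL is a placeholder):

import requests

resp = requests.get("http://localhost:8000/v1/models")
for card in resp.json()["data"]:
    # Base models report max_model_len; LoRA cards point back to the base
    # model through "root".
    print(card["id"], card.get("root"), card.get("max_model_len"))
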
78
 
79
  def create_error_response(
80
  self,
81
  message: str,
 
85
  type=err_type,
86
  code=status_code.value)
87
 
88
+ def create_streaming_error_response(
89
+ self,
90
+ message: str,
91
+ err_type: str = "BadRequestError",
92
+ status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> str:
93
+ json_str = json.dumps({
94
+ "error":
95
+ self.create_error_response(message=message,
96
+ err_type=err_type,
97
+ status_code=status_code).model_dump()
98
+ })
99
+ return json_str
100
+
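
Because errors raised mid-stream can no longer change the HTTP status, create_streaming_error_response wraps the usual ErrorResponse in a JSON envelope that the generators yield as a data: frame. A standalone approximation of the resulting shape (built here without the protocol classes, so the field set is only indicative):

import json
from http import HTTPStatus

# Approximation of the envelope; the real payload is ErrorResponse.model_dump().
error_frame = "data: " + json.dumps({
    "error": {
        "message": "prompt too long",
        "type": "BadRequestError",
        "code": HTTPStatus.BAD_REQUEST.value,
    }
}) + "\n\n"
print(error_frame)
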
101
+ async def _check_model(
102
+ self, request: Union[CompletionRequest, ChatCompletionRequest,
103
+ EmbeddingRequest]
104
+ ) -> Optional[ErrorResponse]:
105
+ if request.model in self.served_model_names:
106
+ return None
107
+ if request.model in [lora.lora_name for lora in self.lora_requests]:
108
+ return None
109
  return self.create_error_response(
110
  message=f"The model `{request.model}` does not exist.",
111
  err_type="NotFoundError",
112
  status_code=HTTPStatus.NOT_FOUND)
113
 
114
+ def _maybe_get_lora(
115
+ self, request: Union[CompletionRequest, ChatCompletionRequest,
116
+ EmbeddingRequest]
117
+ ) -> Optional[LoRARequest]:
118
+ if request.model in self.served_model_names:
119
+ return None
120
+ for lora in self.lora_requests:
121
+ if request.model == lora.lora_name:
122
+ return lora
123
+ # If _check_model has been called earlier, this will be unreachable
124
+ raise ValueError(f"The model `{request.model}` does not exist.")
125
+
126
  def _validate_prompt_and_tokenize(
127
  self,
128
+ request: Union[ChatCompletionRequest, CompletionRequest,
129
+ EmbeddingRequest],
130
  prompt: Optional[str] = None,
131
+ prompt_ids: Optional[List[int]] = None,
132
+ truncate_prompt_tokens: Optional[Annotated[int,
133
+ Field(ge=1)]] = None,
134
+ add_special_tokens: Optional[bool] = True
135
+ ) -> Tuple[List[int], str]:
136
  if not (prompt or prompt_ids):
137
  raise ValueError("Either prompt or prompt_ids should be provided.")
138
  if (prompt and prompt_ids):
139
  raise ValueError(
140
  "Only one of prompt or prompt_ids should be provided.")
141
 
142
+ if prompt_ids is None:
143
+ # When using OpenAIServingChat for chat completions, for
144
+ # most models the special tokens (e.g., BOS) have already
145
+ # been added by the chat template. Therefore, we do not
146
+ # need to add them again.
147
+ # Set add_special_tokens to False (by default) to avoid
148
+ # adding the BOS tokens again.
149
+ tokenizer_kwargs: Dict[str, Any] = {
150
+ "add_special_tokens": add_special_tokens
151
+ }
152
+ if truncate_prompt_tokens is not None:
153
+ tokenizer_kwargs.update({
154
+ "truncation": True,
155
+ "max_length": truncate_prompt_tokens,
156
+ })
157
+ input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids
158
+ elif truncate_prompt_tokens is not None:
159
+ input_ids = prompt_ids[-truncate_prompt_tokens:]
160
+ else:
161
+ input_ids = prompt_ids
162
+
163
+ input_text = prompt if prompt is not None else self.tokenizer.decode(
164
+ prompt_ids)
165
  token_num = len(input_ids)
166
 
167
+ # Note: EmbeddingRequest doesn't have max_tokens
168
+ if isinstance(request, EmbeddingRequest):
169
+ if token_num > self.max_model_len:
170
+ raise ValueError(
171
+ f"This model's maximum context length is "
172
+ f"{self.max_model_len} tokens. However, you requested "
173
+ f"{token_num} tokens in the input for embedding "
174
+ f"generation. Please reduce the length of the input.", )
175
+ return input_ids, input_text
176
+
177
  if request.max_tokens is None:
178
+ if token_num >= self.max_model_len:
179
+ raise ValueError(
180
+ f"This model's maximum context length is "
181
+ f"{self.max_model_len} tokens. However, you requested "
182
+ f"{token_num} tokens in the messages, "
183
+ f"Please reduce the length of the messages.", )
184
  request.max_tokens = self.max_model_len - token_num
185
 
186
  if token_num + request.max_tokens > self.max_model_len:
187
  raise ValueError(
188
+ f"This model's maximum context length is "
189
+ f"{self.max_model_len} tokens. However, you requested "
190
+ f"{request.max_tokens + token_num} tokens "
191
  f"({token_num} in the messages, "
192
  f"{request.max_tokens} in the completion). "
193
  f"Please reduce the length of the messages or completion.", )
194
  else:
195
+ return input_ids, input_text
196
+
197
+ def _get_decoded_token(self, logprob: Logprob, token_id: int) -> str:
198
+ if logprob.decoded_token is not None:
199
+ return logprob.decoded_token
200
+ return self.tokenizer.decode(token_id)
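
The truncate_prompt_tokens handling in _validate_prompt_and_tokenize above keeps only the trailing tokens, whether the prompt arrives as text (via the tokenizer's truncation settings) or as a pre-tokenized list. A standalone sketch of the token-list branch:

truncate_prompt_tokens = 4
prompt_ids = [101, 7592, 1010, 2088, 999, 102]

# Mirrors the `elif truncate_prompt_tokens is not None` branch: keep the tail.
input_ids = prompt_ids[-truncate_prompt_tokens:]
assert input_ids == [1010, 2088, 999, 102]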