import asyncio
from http import HTTPStatus
from typing import Dict, List, Optional, Union

from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.engine.async_llm_engine import AsyncLLMEngine
from protocol import (CompletionRequest, ChatCompletionRequest, ErrorResponse,
                      LogProbs, ModelCard, ModelList, ModelPermission)

logger = init_logger(__name__)


class OpenAIServing:

    def __init__(self, engine: AsyncLLMEngine, served_model: str):
        self.engine = engine
        self.served_model = served_model

        self.max_model_len = 0
        self.tokenizer = None

        try:
            event_loop = asyncio.get_running_loop()
        except RuntimeError:
            event_loop = None

        if event_loop is not None and event_loop.is_running():
            # If this is instantiated by Ray Serve, there is already a
            # running event loop, so schedule post-init on it.
            event_loop.create_task(self._post_init())
        else:
            # When using a single vLLM instance without engine_use_ray.
            asyncio.run(self._post_init())

    async def _post_init(self):
        engine_model_config = await self.engine.get_model_config()
        self.max_model_len = engine_model_config.max_model_len

        # A separate tokenizer to map token IDs to strings.
        self.tokenizer = get_tokenizer(
            engine_model_config.tokenizer,
            tokenizer_mode=engine_model_config.tokenizer_mode,
            trust_remote_code=engine_model_config.trust_remote_code)

    async def show_available_models(self) -> ModelList:
        """Show available models. Right now we only have one model."""
        model_cards = [
            ModelCard(id=self.served_model,
                      root=self.served_model,
                      permission=[ModelPermission()])
        ]
        return ModelList(data=model_cards)

    def _create_logprobs(
        self,
        token_ids: List[int],
        top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None,
        num_output_top_logprobs: Optional[int] = None,
        initial_text_offset: int = 0,
    ) -> LogProbs:
        """Create OpenAI-style logprobs."""
        logprobs = LogProbs()
        last_token_len = 0
        if num_output_top_logprobs:
            logprobs.top_logprobs = []
        for i, token_id in enumerate(token_ids):
            step_top_logprobs = top_logprobs[i]
            if step_top_logprobs is not None:
                token_logprob = step_top_logprobs[token_id]
            else:
                token_logprob = None
            token = self.tokenizer.convert_ids_to_tokens(token_id)
            logprobs.tokens.append(token)
            logprobs.token_logprobs.append(token_logprob)
            # Character offsets accumulate from the length of the previous
            # token string.
            if len(logprobs.text_offset) == 0:
                logprobs.text_offset.append(initial_text_offset)
            else:
                logprobs.text_offset.append(logprobs.text_offset[-1] +
                                            last_token_len)
            last_token_len = len(token)

            if num_output_top_logprobs:
                # Map token IDs to token strings for the top logprobs of
                # this step.
                logprobs.top_logprobs.append({
                    self.tokenizer.convert_ids_to_tokens(i): p
                    for i, p in step_top_logprobs.items()
                } if step_top_logprobs else None)
        return logprobs

    def create_error_response(
            self,
            message: str,
            err_type: str = "BadRequestError",
            status_code: HTTPStatus = HTTPStatus.BAD_REQUEST
    ) -> ErrorResponse:
        return ErrorResponse(message=message,
                             type=err_type,
                             code=status_code.value)

    async def _check_model(self, request) -> Optional[ErrorResponse]:
        if request.model == self.served_model:
            return
        return self.create_error_response(
            message=f"The model `{request.model}` does not exist.",
            err_type="NotFoundError",
            status_code=HTTPStatus.NOT_FOUND)

    def _validate_prompt_and_tokenize(
            self,
            request: Union[ChatCompletionRequest, CompletionRequest],
            prompt: Optional[str] = None,
            prompt_ids: Optional[List[int]] = None) -> List[int]:
        if not (prompt or prompt_ids):
            raise ValueError("Either prompt or prompt_ids should be provided.")
        if prompt and prompt_ids:
            raise ValueError(
                "Only one of prompt or prompt_ids should be provided.")

        input_ids = prompt_ids if prompt_ids is not None else self.tokenizer(
            prompt).input_ids
        token_num = len(input_ids)

        # If max_tokens is not set, default to the remaining context window.
        if request.max_tokens is None:
            request.max_tokens = self.max_model_len - token_num

        if token_num + request.max_tokens > self.max_model_len:
            raise ValueError(
                f"This model's maximum context length is {self.max_model_len} tokens. "
                f"However, you requested {request.max_tokens + token_num} tokens "
                f"({token_num} in the messages, "
                f"{request.max_tokens} in the completion). "
                f"Please reduce the length of the messages or completion.")
        else:
            return input_ids
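

# Illustrative usage sketch (an assumption, not part of the original module):
# shows one way OpenAIServing could be constructed from an AsyncLLMEngine.
# The model name below is a placeholder, and a real server would pass the
# resulting object to HTTP route handlers (e.g. for /v1/models) rather than
# stopping here.
if __name__ == "__main__":
    from vllm.engine.arg_utils import AsyncEngineArgs

    engine_args = AsyncEngineArgs(model="facebook/opt-125m")  # placeholder
    engine = AsyncLLMEngine.from_engine_args(engine_args)

    # __init__ runs _post_init() itself (no event loop is running here),
    # so the tokenizer and max_model_len are populated after construction.
    openai_serving = OpenAIServing(engine, served_model="facebook/opt-125m")
    logger.info("Serving model: %s (max_model_len=%d)",
                openai_serving.served_model, openai_serving.max_model_len)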