# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of NVIDIA CORPORATION nor the names of its # contributors may be used to endorse or promote products derived # from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. name: "ensemble" platform: "ensemble" max_batch_size: 16 input [ { name: "text_input" data_type: TYPE_STRING dims: [ -1 ] }, { name: "decoder_text_input" data_type: TYPE_STRING dims: [ -1 ] optional: true }, { name: "max_tokens" data_type: TYPE_INT32 dims: [ -1 ] }, { name: "bad_words" data_type: TYPE_STRING dims: [ -1 ] optional: true }, { name: "stop_words" data_type: TYPE_STRING dims: [ -1 ] optional: true }, { name: "end_id" data_type: TYPE_INT32 dims: [ 1 ] optional: true }, { name: "pad_id" data_type: TYPE_INT32 dims: [ 1 ] optional: true }, { name: "top_k" data_type: TYPE_INT32 dims: [ 1 ] optional: true }, { name: "top_p" data_type: TYPE_FP32 dims: [ 1 ] optional: true }, { name: "temperature" data_type: TYPE_FP32 dims: [ 1 ] optional: true }, { name: "length_penalty" data_type: TYPE_FP32 dims: [ 1 ] optional: true }, { name: "repetition_penalty" data_type: TYPE_FP32 dims: [ 1 ] optional: true }, { name: "min_length" data_type: TYPE_INT32 dims: [ 1 ] optional: true }, { name: "presence_penalty" data_type: TYPE_FP32 dims: [ 1 ] optional: true }, { name: "frequency_penalty" data_type: TYPE_FP32 dims: [ 1 ] optional: true }, { name: "random_seed" data_type: TYPE_UINT64 dims: [ 1 ] optional: true }, { name: "return_log_probs" data_type: TYPE_BOOL dims: [ 1 ] optional: true }, { name: "return_context_logits" data_type: TYPE_BOOL dims: [ 1 ] optional: true }, { name: "return_generation_logits" data_type: TYPE_BOOL dims: [ 1 ] optional: true }, { name: "beam_width" data_type: TYPE_INT32 dims: [ 1 ] optional: true }, { name: "stream" data_type: TYPE_BOOL dims: [ 1 ] optional: true }, { name: "prompt_embedding_table" data_type: TYPE_FP16 dims: [ -1, -1 ] optional: true }, { name: "prompt_vocab_size" data_type: TYPE_INT32 dims: [ 1 ] optional: true }, { name: "embedding_bias_words" data_type: TYPE_STRING dims: [ -1 ] optional: true }, { name: "embedding_bias_weights" data_type: TYPE_FP32 dims: [ -1 ] optional: true } ] output [ { name: "text_output" data_type: TYPE_STRING dims: [ -1 ] }, { name: "cum_log_probs" data_type: TYPE_FP32 dims: [ -1 ] }, { name: "output_log_probs" data_type: TYPE_FP32 dims: [ -1, -1 ] }, { name: "context_logits" data_type: TYPE_FP32 dims: [ -1, -1 ] }, { name: "generation_logits" data_type: TYPE_FP32 dims: [ -1, -1, -1 ] } ] ensemble_scheduling { step [ { model_name: "preprocessing" model_version: -1 input_map { key: "QUERY" value: "text_input" } input_map { key: "DECODER_QUERY" value: "decoder_text_input" } input_map { key: "REQUEST_OUTPUT_LEN" value: "max_tokens" } input_map { key: "BAD_WORDS_DICT" value: "bad_words" } input_map { key: "STOP_WORDS_DICT" value: "stop_words" } input_map { key: "EMBEDDING_BIAS_WORDS" value: "embedding_bias_words" } input_map { key: "EMBEDDING_BIAS_WEIGHTS" value: "embedding_bias_weights" } input_map { key: "END_ID" value: "end_id" } input_map { key: "PAD_ID" value: "pad_id" } output_map { key: "REQUEST_INPUT_LEN" value: "_REQUEST_INPUT_LEN" } output_map { key: "INPUT_ID" value: "_INPUT_ID" } output_map { key: "REQUEST_DECODER_INPUT_LEN" value: "_REQUEST_DECODER_INPUT_LEN" } output_map { key: "DECODER_INPUT_ID" value: "_DECODER_INPUT_ID" } output_map { key: "REQUEST_OUTPUT_LEN" value: "_REQUEST_OUTPUT_LEN" } output_map { key: "STOP_WORDS_IDS" value: "_STOP_WORDS_IDS" } output_map { key: "BAD_WORDS_IDS" value: "_BAD_WORDS_IDS" } output_map { key: "EMBEDDING_BIAS" value: "_EMBEDDING_BIAS" } output_map { key: "OUT_END_ID" value: "_PREPROCESSOR_END_ID" } output_map { key: "OUT_PAD_ID" value: "_PREPROCESSOR_PAD_ID" } }, { model_name: "tensorrt_llm" model_version: -1 input_map { key: "input_ids" value: "_INPUT_ID" } input_map { key: "decoder_input_ids" value: "_DECODER_INPUT_ID" } input_map { key: "input_lengths" value: "_REQUEST_INPUT_LEN" } input_map { key: "decoder_input_lengths" value: "_REQUEST_DECODER_INPUT_LEN" } input_map { key: "request_output_len" value: "_REQUEST_OUTPUT_LEN" } input_map { key: "end_id" value: "_PREPROCESSOR_END_ID" } input_map { key: "pad_id" value: "_PREPROCESSOR_PAD_ID" } input_map { key: "embedding_bias" value: "_EMBEDDING_BIAS" } input_map { key: "runtime_top_k" value: "top_k" } input_map { key: "runtime_top_p" value: "top_p" } input_map { key: "temperature" value: "temperature" } input_map { key: "len_penalty" value: "length_penalty" } input_map { key: "repetition_penalty" value: "repetition_penalty" } input_map { key: "min_length" value: "min_length" } input_map { key: "presence_penalty" value: "presence_penalty" } input_map { key: "frequency_penalty" value: "frequency_penalty" } input_map { key: "random_seed" value: "random_seed" } input_map { key: "return_log_probs" value: "return_log_probs" } input_map { key: "return_context_logits" value: "return_context_logits" } input_map { key: "return_generation_logits" value: "return_generation_logits" } input_map { key: "beam_width" value: "beam_width" } input_map { key: "streaming" value: "stream" } input_map { key: "prompt_embedding_table" value: "prompt_embedding_table" } input_map { key: "prompt_vocab_size" value: "prompt_vocab_size" } input_map { key: "stop_words_list" value: "_STOP_WORDS_IDS" } input_map { key: "bad_words_list" value: "_BAD_WORDS_IDS" } output_map { key: "output_ids" value: "_TOKENS_BATCH" } output_map { key: "sequence_length" value: "_SEQUENCE_LENGTH" }, output_map { key: "cum_log_probs" value: "_CUM_LOG_PROBS" } output_map { key: "output_log_probs" value: "_OUTPUT_LOG_PROBS" }, output_map { key: "context_logits" value: "_CONTEXT_LOGITS" }, output_map { key: "generation_logits" value: "_GENERATION_LOGITS" } }, { model_name: "postprocessing" model_version: -1 input_map { key: "TOKENS_BATCH" value: "_TOKENS_BATCH" } input_map { key: "CUM_LOG_PROBS" value: "_CUM_LOG_PROBS" } input_map { key: "OUTPUT_LOG_PROBS" value: "_OUTPUT_LOG_PROBS" } input_map { key: "CONTEXT_LOGITS" value: "_CONTEXT_LOGITS" } input_map { key: "GENERATION_LOGITS" value: "_GENERATION_LOGITS" } input_map { key: "SEQUENCE_LENGTH" value: "_SEQUENCE_LENGTH" } output_map { key: "OUTPUT" value: "text_output" } output_map { key: "OUT_OUTPUT_LOG_PROBS" value: "output_log_probs" } output_map { key: "OUT_CUM_LOG_PROBS" value: "cum_log_probs" } output_map { key: "OUT_CONTEXT_LOGITS" value: "context_logits" } output_map { key: "OUT_GENERATION_LOGITS" value: "generation_logits" } } ] }