# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# Ensemble model definition: chains three composing models
#   preprocessing -> tensorrt_llm -> postprocessing
# and exposes a single text-in / text-out interface to clients.
name: "ensemble"
platform: "ensemble"
max_batch_size: 128

# ---------------------------------------------------------------------------
# Request tensors accepted by the ensemble.
# The first four inputs are required; the rest are declared `optional: true`.
# ---------------------------------------------------------------------------
input [
  # --- Inputs routed to the "preprocessing" step ---------------------------
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "max_tokens"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  # NOTE(review): bad_words / stop_words are passed to "preprocessing", but no
  # tensor derived from them is mapped onward to "tensorrt_llm" in this file.
  # Confirm against the composing models' configs whether that is intended.
  {
    name: "bad_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "stop_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },

  # --- Optional inputs routed directly to the "tensorrt_llm" step ----------
  {
    name: "end_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_k"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "length_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  }
]

# ---------------------------------------------------------------------------
# Response tensors; both are produced by the "postprocessing" step.
# ---------------------------------------------------------------------------
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1, -1 ]
  },
  {
    name: "output_tokens"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  }
]

# ---------------------------------------------------------------------------
# Pipeline wiring.
# In each input_map / output_map, `key` is the tensor name inside the
# composing model and `value` is the tensor name at ensemble scope.
# Names starting with "_" are intermediate tensors internal to the ensemble;
# the remaining values refer to the ensemble inputs/outputs declared above.
# ---------------------------------------------------------------------------
ensemble_scheduling {
  step [
    # Step 1: consumes the text-level request inputs and emits the
    # intermediate tensors used by the next step.
    {
      model_name: "preprocessing"
      model_version: -1
      input_map { key: "QUERY" value: "text_input" }
      input_map { key: "REQUEST_OUTPUT_LEN" value: "max_tokens" }
      input_map { key: "BAD_WORDS_DICT" value: "bad_words" }
      input_map { key: "STOP_WORDS_DICT" value: "stop_words" }
      output_map { key: "REQUEST_INPUT_LEN" value: "_REQUEST_INPUT_LEN" }
      output_map { key: "INPUT_ID" value: "_INPUT_ID" }
      output_map { key: "REQUEST_OUTPUT_LEN" value: "_REQUEST_OUTPUT_LEN" }
    },
    # Step 2: takes the three intermediates from step 1 plus the optional
    # ensemble inputs, and emits "_TOKENS_BATCH".
    {
      model_name: "tensorrt_llm"
      model_version: -1
      input_map { key: "input_ids" value: "_INPUT_ID" }
      input_map { key: "input_lengths" value: "_REQUEST_INPUT_LEN" }
      input_map { key: "request_output_len" value: "_REQUEST_OUTPUT_LEN" }
      input_map { key: "end_id" value: "end_id" }
      input_map { key: "pad_id" value: "pad_id" }
      input_map { key: "runtime_top_k" value: "top_k" }
      input_map { key: "runtime_top_p" value: "top_p" }
      input_map { key: "temperature" value: "temperature" }
      input_map { key: "len_penalty" value: "length_penalty" }
      input_map { key: "repetition_penalty" value: "repetition_penalty" }
      input_map { key: "min_length" value: "min_length" }
      input_map { key: "presence_penalty" value: "presence_penalty" }
      input_map { key: "random_seed" value: "random_seed" }
      input_map { key: "beam_width" value: "beam_width" }
      input_map { key: "streaming" value: "stream" }
      output_map { key: "output_ids" value: "_TOKENS_BATCH" }
    },
    # Step 3: turns "_TOKENS_BATCH" into the two ensemble outputs.
    {
      model_name: "postprocessing"
      model_version: -1
      input_map { key: "TOKENS_BATCH" value: "_TOKENS_BATCH" }
      output_map { key: "OUTPUT" value: "text_output" }
      output_map { key: "OUTPUT_LENS" value: "output_tokens" }
    }
  ]
}