DeepInfra
/

Llama-2-70b-chat-hf-trt-fp8

Model card Files Files and versions Community

Llama-2-70b-chat-hf-trt-fp8 / ensemble /config.pbtxt

yessenzhar

add smaller files

a83b588 about 1 year ago

raw

history blame

5.66 kB

	# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions
	# are met:
	# * Redistributions of source code must retain the above copyright
	# notice, this list of conditions and the following disclaimer.
	# * Redistributions in binary form must reproduce the above copyright
	# notice, this list of conditions and the following disclaimer in the
	# documentation and/or other materials provided with the distribution.
	# * Neither the name of NVIDIA CORPORATION nor the names of its
	# contributors may be used to endorse or promote products derived
	# from this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
	# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
	# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
	# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
	# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
	# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
	# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
	# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
	# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	# Top-level identity of this model. Both the name and the platform are
	# "ensemble"; the actual pipeline is described by the ensemble_scheduling
	# section of this file.
	name: "ensemble"
	platform: "ensemble"
	# Maximum batch size declared for this model. Per the Triton model
	# configuration docs, a non-zero value means the dims listed for each
	# input/output below do not include the batch dimension.
	max_batch_size: 128
	# Tensors a client can send to the ensemble. The first four entries carry
	# no `optional` flag; every remaining entry is explicitly marked
	# `optional: true`.
	input [
	  # --- Inputs consumed by the "preprocessing" step -----------------------
	  # Mapped to preprocessing input QUERY.
	  {
	    name: "text_input"
	    data_type: TYPE_STRING
	    dims: [ -1 ]
	  },
	  # Mapped to preprocessing input REQUEST_OUTPUT_LEN.
	  {
	    name: "max_tokens"
	    data_type: TYPE_UINT32
	    dims: [ -1 ]
	  },
	  # Mapped to preprocessing input BAD_WORDS_DICT.
	  {
	    name: "bad_words"
	    data_type: TYPE_STRING
	    dims: [ -1 ]
	  },
	  # Mapped to preprocessing input STOP_WORDS_DICT.
	  {
	    name: "stop_words"
	    data_type: TYPE_STRING
	    dims: [ -1 ]
	  },

	  # --- Optional inputs forwarded to the "tensorrt_llm" step --------------
	  # Each of these is a single-element tensor (dims: [ 1 ]).
	  {
	    name: "end_id"
	    data_type: TYPE_UINT32
	    dims: [ 1 ]
	    optional: true
	  },
	  {
	    name: "pad_id"
	    data_type: TYPE_UINT32
	    dims: [ 1 ]
	    optional: true
	  },
	  # Forwarded under the name runtime_top_k.
	  {
	    name: "top_k"
	    data_type: TYPE_UINT32
	    dims: [ 1 ]
	    optional: true
	  },
	  # Forwarded under the name runtime_top_p.
	  {
	    name: "top_p"
	    data_type: TYPE_FP32
	    dims: [ 1 ]
	    optional: true
	  },
	  {
	    name: "temperature"
	    data_type: TYPE_FP32
	    dims: [ 1 ]
	    optional: true
	  },
	  # Forwarded under the name len_penalty.
	  {
	    name: "length_penalty"
	    data_type: TYPE_FP32
	    dims: [ 1 ]
	    optional: true
	  },
	  {
	    name: "repetition_penalty"
	    data_type: TYPE_FP32
	    dims: [ 1 ]
	    optional: true
	  },
	  {
	    name: "min_length"
	    data_type: TYPE_UINT32
	    dims: [ 1 ]
	    optional: true
	  },
	  {
	    name: "presence_penalty"
	    data_type: TYPE_FP32
	    dims: [ 1 ]
	    optional: true
	  },
	  {
	    name: "random_seed"
	    data_type: TYPE_UINT64
	    dims: [ 1 ]
	    optional: true
	  },
	  {
	    name: "beam_width"
	    data_type: TYPE_UINT32
	    dims: [ 1 ]
	    optional: true
	  },
	  # Forwarded under the name streaming.
	  {
	    name: "stream"
	    data_type: TYPE_BOOL
	    dims: [ 1 ]
	    optional: true
	  }
	]
	# Tensors returned to the client. Both are filled by the "postprocessing"
	# step of the ensemble: its OUTPUT becomes text_output and its OUTPUT_LENS
	# becomes output_tokens.
	output [
	  {
	    name: "text_output"
	    data_type: TYPE_STRING
	    dims: [ -1, -1 ]
	  },
	  {
	    name: "output_tokens"
	    data_type: TYPE_UINT32
	    dims: [ -1 ]
	  }
	]
	# Three-step pipeline: preprocessing -> tensorrt_llm -> postprocessing.
	#
	# In every input_map/output_map entry, `key` is the tensor name on the step's
	# model and `value` is the tensor name inside the ensemble. Values starting
	# with "_" (_INPUT_ID, _REQUEST_INPUT_LEN, _REQUEST_OUTPUT_LEN,
	# _TOKENS_BATCH) are not declared in the ensemble's input/output lists; they
	# only connect one step to the next.
	#
	# model_version: -1 is used for every step (latest available version, per
	# the Triton ensemble documentation).
	ensemble_scheduling {
	  step [
	    # Step 1: "preprocessing" receives the four client tensors text_input,
	    # max_tokens, bad_words and stop_words, and emits three intermediates.
	    # NOTE(review): bad_words / stop_words are passed into this step, but no
	    # preprocessing output derived from them is mapped to tensorrt_llm
	    # below -- confirm whether that is intentional for this backend version.
	    {
	      model_name: "preprocessing"
	      model_version: -1
	      input_map { key: "QUERY" value: "text_input" }
	      input_map { key: "REQUEST_OUTPUT_LEN" value: "max_tokens" }
	      input_map { key: "BAD_WORDS_DICT" value: "bad_words" }
	      input_map { key: "STOP_WORDS_DICT" value: "stop_words" }
	      output_map { key: "REQUEST_INPUT_LEN" value: "_REQUEST_INPUT_LEN" }
	      output_map { key: "INPUT_ID" value: "_INPUT_ID" }
	      output_map { key: "REQUEST_OUTPUT_LEN" value: "_REQUEST_OUTPUT_LEN" }
	    },
	    # Step 2: "tensorrt_llm" consumes the three intermediates from step 1
	    # plus the optional client tensors, and emits _TOKENS_BATCH.
	    {
	      model_name: "tensorrt_llm"
	      model_version: -1
	      # Intermediates produced by the preprocessing step.
	      input_map { key: "input_ids" value: "_INPUT_ID" }
	      input_map { key: "input_lengths" value: "_REQUEST_INPUT_LEN" }
	      input_map { key: "request_output_len" value: "_REQUEST_OUTPUT_LEN" }
	      # Optional tensors taken directly from the client request; some are
	      # renamed on the way in (top_k, top_p, length_penalty, stream).
	      input_map { key: "end_id" value: "end_id" }
	      input_map { key: "pad_id" value: "pad_id" }
	      input_map { key: "runtime_top_k" value: "top_k" }
	      input_map { key: "runtime_top_p" value: "top_p" }
	      input_map { key: "temperature" value: "temperature" }
	      input_map { key: "len_penalty" value: "length_penalty" }
	      input_map { key: "repetition_penalty" value: "repetition_penalty" }
	      input_map { key: "min_length" value: "min_length" }
	      input_map { key: "presence_penalty" value: "presence_penalty" }
	      input_map { key: "random_seed" value: "random_seed" }
	      input_map { key: "beam_width" value: "beam_width" }
	      input_map { key: "streaming" value: "stream" }
	      output_map { key: "output_ids" value: "_TOKENS_BATCH" }
	    },
	    # Step 3: "postprocessing" consumes _TOKENS_BATCH and fills both of the
	    # ensemble's declared outputs.
	    {
	      model_name: "postprocessing"
	      model_version: -1
	      input_map { key: "TOKENS_BATCH" value: "_TOKENS_BATCH" }
	      output_map { key: "OUTPUT" value: "text_output" }
	      output_map { key: "OUTPUT_LENS" value: "output_tokens" }
	    }
	  ]
	}