# Ensemble model configuration (protobuf text format).

# Model identity. `platform: "ensemble"` marks this model as a pipeline whose
# work is described by the steps under `ensemble_scheduling` in this file.
name: "ensemble"
platform: "ensemble"
# Upper bound on the batch size accepted by this model.
max_batch_size: 16

# Request tensors accepted by the ensemble.
# Only `text_input` and `max_tokens` are required; every other entry carries
# `optional: true`. Each tensor is forwarded to exactly one step through the
# `input_map` entries in `ensemble_scheduling`.
input [
  # --- Text and tokenization inputs (consumed by the "preprocessing" step) ---
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "decoder_text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "max_tokens"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "bad_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "stop_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  # --- Generation controls (passed straight to the "tensorrt_llm" step) ---
  {
    name: "top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "length_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "frequency_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_context_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "return_generation_logits"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  # Mapped to the "streaming" input of the "tensorrt_llm" step.
  {
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "prompt_embedding_table"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
  },
  {
    name: "prompt_vocab_size"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  # --- Embedding bias inputs (consumed by the "preprocessing" step) ---
  {
    name: "embedding_bias_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
    optional: true
  },
  {
    name: "embedding_bias_weights"
    data_type: TYPE_FP32
    dims: [ -1 ]
    optional: true
  }
]

# Response tensors returned by the ensemble. All five are produced by the
# "postprocessing" step (see its `output_map` entries in `ensemble_scheduling`).
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "context_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "generation_logits"
    data_type: TYPE_FP32
    dims: [ -1, -1, -1 ]
  }
]

# Pipeline wiring: preprocessing -> tensorrt_llm -> postprocessing.
#
# In every map, `key` is the tensor name on the step's own model and `value`
# is the ensemble-level tensor it is bound to. Values starting with `_` are
# intermediate tensors that exist only inside the ensemble; all other values
# are the ensemble's declared inputs or outputs.
# `model_version: -1` is used for every step (latest version, per the Triton
# ensemble documentation).
ensemble_scheduling {
  step [
    # Step 1: turn the text inputs into the tensors the engine step consumes.
    {
      model_name: "preprocessing"
      model_version: -1
      input_map {
        key: "QUERY"
        value: "text_input"
      }
      input_map {
        key: "DECODER_QUERY"
        value: "decoder_text_input"
      }
      input_map {
        key: "REQUEST_OUTPUT_LEN"
        value: "max_tokens"
      }
      input_map {
        key: "BAD_WORDS_DICT"
        value: "bad_words"
      }
      input_map {
        key: "STOP_WORDS_DICT"
        value: "stop_words"
      }
      input_map {
        key: "EMBEDDING_BIAS_WORDS"
        value: "embedding_bias_words"
      }
      input_map {
        key: "EMBEDDING_BIAS_WEIGHTS"
        value: "embedding_bias_weights"
      }
      input_map {
        key: "END_ID"
        value: "end_id"
      }
      input_map {
        key: "PAD_ID"
        value: "pad_id"
      }
      output_map {
        key: "REQUEST_INPUT_LEN"
        value: "_REQUEST_INPUT_LEN"
      }
      output_map {
        key: "INPUT_ID"
        value: "_INPUT_ID"
      }
      output_map {
        key: "REQUEST_DECODER_INPUT_LEN"
        value: "_REQUEST_DECODER_INPUT_LEN"
      }
      output_map {
        key: "DECODER_INPUT_ID"
        value: "_DECODER_INPUT_ID"
      }
      output_map {
        key: "REQUEST_OUTPUT_LEN"
        value: "_REQUEST_OUTPUT_LEN"
      }
      output_map {
        key: "STOP_WORDS_IDS"
        value: "_STOP_WORDS_IDS"
      }
      output_map {
        key: "BAD_WORDS_IDS"
        value: "_BAD_WORDS_IDS"
      }
      output_map {
        key: "EMBEDDING_BIAS"
        value: "_EMBEDDING_BIAS"
      }
      output_map {
        key: "OUT_END_ID"
        value: "_PREPROCESSOR_END_ID"
      }
      output_map {
        key: "OUT_PAD_ID"
        value: "_PREPROCESSOR_PAD_ID"
      }
    },
    # Step 2: generation. Takes the intermediate tensors from step 1 plus the
    # generation-control inputs forwarded unchanged from the ensemble request.
    {
      model_name: "tensorrt_llm"
      model_version: -1
      # Inputs produced by the "preprocessing" step.
      input_map {
        key: "input_ids"
        value: "_INPUT_ID"
      }
      input_map {
        key: "decoder_input_ids"
        value: "_DECODER_INPUT_ID"
      }
      input_map {
        key: "input_lengths"
        value: "_REQUEST_INPUT_LEN"
      }
      input_map {
        key: "decoder_input_lengths"
        value: "_REQUEST_DECODER_INPUT_LEN"
      }
      input_map {
        key: "request_output_len"
        value: "_REQUEST_OUTPUT_LEN"
      }
      input_map {
        key: "end_id"
        value: "_PREPROCESSOR_END_ID"
      }
      input_map {
        key: "pad_id"
        value: "_PREPROCESSOR_PAD_ID"
      }
      input_map {
        key: "embedding_bias"
        value: "_EMBEDDING_BIAS"
      }
      # Inputs taken directly from the ensemble request. Note the renames:
      # top_k -> runtime_top_k, top_p -> runtime_top_p,
      # length_penalty -> len_penalty, stream -> streaming.
      input_map {
        key: "runtime_top_k"
        value: "top_k"
      }
      input_map {
        key: "runtime_top_p"
        value: "top_p"
      }
      input_map {
        key: "temperature"
        value: "temperature"
      }
      input_map {
        key: "len_penalty"
        value: "length_penalty"
      }
      input_map {
        key: "repetition_penalty"
        value: "repetition_penalty"
      }
      input_map {
        key: "min_length"
        value: "min_length"
      }
      input_map {
        key: "presence_penalty"
        value: "presence_penalty"
      }
      input_map {
        key: "frequency_penalty"
        value: "frequency_penalty"
      }
      input_map {
        key: "random_seed"
        value: "random_seed"
      }
      input_map {
        key: "return_log_probs"
        value: "return_log_probs"
      }
      input_map {
        key: "return_context_logits"
        value: "return_context_logits"
      }
      input_map {
        key: "return_generation_logits"
        value: "return_generation_logits"
      }
      input_map {
        key: "beam_width"
        value: "beam_width"
      }
      input_map {
        key: "streaming"
        value: "stream"
      }
      input_map {
        key: "prompt_embedding_table"
        value: "prompt_embedding_table"
      }
      input_map {
        key: "prompt_vocab_size"
        value: "prompt_vocab_size"
      }
      # Word-list tensors produced by the "preprocessing" step.
      input_map {
        key: "stop_words_list"
        value: "_STOP_WORDS_IDS"
      }
      input_map {
        key: "bad_words_list"
        value: "_BAD_WORDS_IDS"
      }
      # Every output below is consumed by the "postprocessing" step.
      output_map {
        key: "output_ids"
        value: "_TOKENS_BATCH"
      }
      output_map {
        key: "sequence_length"
        value: "_SEQUENCE_LENGTH"
      }
      output_map {
        key: "cum_log_probs"
        value: "_CUM_LOG_PROBS"
      }
      output_map {
        key: "output_log_probs"
        value: "_OUTPUT_LOG_PROBS"
      }
      output_map {
        key: "context_logits"
        value: "_CONTEXT_LOGITS"
      }
      output_map {
        key: "generation_logits"
        value: "_GENERATION_LOGITS"
      }
    },
    # Step 3: map the engine outputs onto the ensemble's declared outputs.
    {
      model_name: "postprocessing"
      model_version: -1
      input_map {
        key: "TOKENS_BATCH"
        value: "_TOKENS_BATCH"
      }
      input_map {
        key: "CUM_LOG_PROBS"
        value: "_CUM_LOG_PROBS"
      }
      input_map {
        key: "OUTPUT_LOG_PROBS"
        value: "_OUTPUT_LOG_PROBS"
      }
      input_map {
        key: "CONTEXT_LOGITS"
        value: "_CONTEXT_LOGITS"
      }
      input_map {
        key: "GENERATION_LOGITS"
        value: "_GENERATION_LOGITS"
      }
      input_map {
        key: "SEQUENCE_LENGTH"
        value: "_SEQUENCE_LENGTH"
      }
      output_map {
        key: "OUTPUT"
        value: "text_output"
      }
      output_map {
        key: "OUT_OUTPUT_LOG_PROBS"
        value: "output_log_probs"
      }
      output_map {
        key: "OUT_CUM_LOG_PROBS"
        value: "cum_log_probs"
      }
      output_map {
        key: "OUT_CONTEXT_LOGITS"
        value: "context_logits"
      }
      output_map {
        key: "OUT_GENERATION_LOGITS"
        value: "generation_logits"
      }
    }
  ]
}
|