|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Triton ensemble model configuration (protobuf text format).
#
# Chains three models into one text-in / text-out endpoint:
#   preprocessing -> tensorrt_llm -> postprocessing
# Tensor names starting with "_" are ensemble-internal: they only connect one
# step's output_map to a later step's input_map and are not visible to clients.

name: "ensemble"
platform: "ensemble"
max_batch_size: 128

# ---------------------------------------------------------------------------
# Request tensors.
# max_batch_size is > 0, so the dims below do not include the batch dimension.
# ---------------------------------------------------------------------------
input [
  # Required inputs; all four are consumed by the "preprocessing" step.
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "max_tokens"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  {
    name: "bad_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "stop_words"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },

  # Optional inputs; each one is forwarded directly to the "tensorrt_llm"
  # step (see its input_map entries for the name each is mapped to).
  {
    name: "end_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "pad_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_k"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "length_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "min_length"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "presence_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_UINT64
    dims: [ 1 ]
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  }
]

# ---------------------------------------------------------------------------
# Response tensors; both are produced by the "postprocessing" step.
# ---------------------------------------------------------------------------
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1, -1 ]
  },
  {
    # NOTE(review): this tensor is fed from the postprocessing output named
    # "OUTPUT_LENS" -- confirm whether it carries token counts or token ids.
    name: "output_tokens"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  }
]

# ---------------------------------------------------------------------------
# Pipeline wiring. In each input_map/output_map, "key" is the tensor name on
# the composing model and "value" is the ensemble-level tensor name.
# model_version: -1 requests the latest available version of the model.
# ---------------------------------------------------------------------------
ensemble_scheduling {
  step [
    # Step 1: preprocessing. Reads the four required request tensors and
    # produces the internal tensors consumed by step 2.
    {
      model_name: "preprocessing"
      model_version: -1
      input_map {
        key: "QUERY"
        value: "text_input"
      }
      input_map {
        key: "REQUEST_OUTPUT_LEN"
        value: "max_tokens"
      }
      # NOTE(review): bad_words / stop_words are passed in here, but no
      # preprocessing output derived from them is mapped into the
      # tensorrt_llm step below -- confirm this is intended.
      input_map {
        key: "BAD_WORDS_DICT"
        value: "bad_words"
      }
      input_map {
        key: "STOP_WORDS_DICT"
        value: "stop_words"
      }
      output_map {
        key: "REQUEST_INPUT_LEN"
        value: "_REQUEST_INPUT_LEN"
      }
      output_map {
        key: "INPUT_ID"
        value: "_INPUT_ID"
      }
      output_map {
        key: "REQUEST_OUTPUT_LEN"
        value: "_REQUEST_OUTPUT_LEN"
      }
    },

    # Step 2: tensorrt_llm. Takes the three internal tensors from step 1 plus
    # the optional request tensors, and emits _TOKENS_BATCH.
    {
      model_name: "tensorrt_llm"
      model_version: -1
      input_map {
        key: "input_ids"
        value: "_INPUT_ID"
      }
      input_map {
        key: "input_lengths"
        value: "_REQUEST_INPUT_LEN"
      }
      input_map {
        key: "request_output_len"
        value: "_REQUEST_OUTPUT_LEN"
      }
      input_map {
        key: "end_id"
        value: "end_id"
      }
      input_map {
        key: "pad_id"
        value: "pad_id"
      }
      # The next few request tensors are renamed on their way into the model.
      input_map {
        key: "runtime_top_k"
        value: "top_k"
      }
      input_map {
        key: "runtime_top_p"
        value: "top_p"
      }
      input_map {
        key: "temperature"
        value: "temperature"
      }
      input_map {
        key: "len_penalty"
        value: "length_penalty"
      }
      input_map {
        key: "repetition_penalty"
        value: "repetition_penalty"
      }
      input_map {
        key: "min_length"
        value: "min_length"
      }
      input_map {
        key: "presence_penalty"
        value: "presence_penalty"
      }
      input_map {
        key: "random_seed"
        value: "random_seed"
      }
      input_map {
        key: "beam_width"
        value: "beam_width"
      }
      input_map {
        key: "streaming"
        value: "stream"
      }
      output_map {
        key: "output_ids"
        value: "_TOKENS_BATCH"
      }
    },

    # Step 3: postprocessing. Consumes _TOKENS_BATCH and produces both
    # ensemble outputs.
    {
      model_name: "postprocessing"
      model_version: -1
      input_map {
        key: "TOKENS_BATCH"
        value: "_TOKENS_BATCH"
      }
      output_map {
        key: "OUTPUT"
        value: "text_output"
      }
      output_map {
        key: "OUTPUT_LENS"
        value: "output_tokens"
      }
    }
  ]
}
|
|