import json
import traceback

import triton_python_backend_utils as pb_utils

from lib.triton_decoder import TritonDecoder
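
# A minimal sketch of the config.pbtxt `parameters` entries read in
# `initialize` below (values are illustrative, not authoritative):
#
#   parameters: { key: "accumulate_tokens" value: { string_value: "false" } }
#   parameters: { key: "tensorrt_llm_model_name" value: { string_value: "tensorrt_llm" } }
#   parameters: { key: "tensorrt_llm_draft_model_name" value: { string_value: "" } }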


class TritonPythonModel:

    def initialize(self, args):
        """Called once when the model is loaded. Parses the model config
        and builds the BLS decoder that orchestrates the pipeline."""
        model_config = json.loads(args['model_config'])
        params = model_config['parameters']

        # Optional flag: when true, streamed partial outputs accumulate into
        # a growing response instead of being sent as per-token deltas.
        accumulate_tokens_str = ''
        if 'accumulate_tokens' in params:
            accumulate_tokens_str = params['accumulate_tokens']['string_value']
        self.accumulate_tokens = accumulate_tokens_str.lower() in [
            'true', 'yes', '1', 't'
        ]

        # Decoupled transaction policy: responses are streamed through a
        # response sender instead of being returned from execute().
        self.decoupled = pb_utils.using_decoupled_model_transaction_policy(
            model_config)

        self.logger = pb_utils.Logger

        # Target TRT-LLM model, plus an optional draft model used only for
        # speculative decoding.
        self.llm_model_name = "tensorrt_llm"
        if "tensorrt_llm_model_name" in params:
            self.llm_model_name = params["tensorrt_llm_model_name"][
                "string_value"]
        self.draft_llm_model_name = None
        if "tensorrt_llm_draft_model_name" in params:
            self.draft_llm_model_name = params[
                "tensorrt_llm_draft_model_name"]["string_value"]

        self.decoder = TritonDecoder(
            streaming=self.decoupled,
            accumulate=self.accumulate_tokens,
            preproc_model_name="preprocessing",
            postproc_model_name="postprocessing",
            llm_model_name=self.llm_model_name,
            draft_llm_model_name=self.draft_llm_model_name)

    def execute(self, requests):
        """Handles a batch of requests: converts each Triton request,
        optionally enables speculative decoding, and streams (decoupled)
        or collects (non-decoupled) the responses."""
        responses = []

        for request in requests:
            if self.decoupled:
                response_sender = request.get_response_sender()
            try:
                req = self.decoder.convert_triton_request(request)
                req.validate()
                # Speculative decoding is requested via a positive
                # num_draft_tokens input and requires a configured draft model.
                speculative_decode = (req.num_draft_tokens is not None
                                      and req.num_draft_tokens[0][0] > 0)
                if speculative_decode and (self.draft_llm_model_name is None
                                           or self.draft_llm_model_name == ""):
                    raise Exception(
                        "cannot perform speculative decoding without draft model"
                    )
                res_gen = self.decoder.decode(
                    req, speculative_decoding=speculative_decode)

                for res in res_gen:
                    triton_response = self.decoder.create_triton_response(res)
                    if self.decoupled:
                        response_sender.send(triton_response)
                    else:
                        responses.append(triton_response)

                if self.decoupled:
                    response_sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

            except Exception:
                self.logger.log_error(traceback.format_exc())
                # On failure, send (or return) a response that carries the
                # error message so the client sees what went wrong.
                error_response = pb_utils.InferenceResponse(
                    output_tensors=[],
                    error=pb_utils.TritonError(traceback.format_exc()))

                if self.decoupled:
                    response_sender.send(error_response)
                    response_sender.send(
                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
                else:
                    responses.append(error_response)

        self.decoder.reset_decoder()
        if self.decoupled:
            # Decoupled mode: everything was already sent via response_sender.
            return None
        else:
            # Non-decoupled mode: exactly one response per request.
            assert len(responses) == len(requests)
            return responses
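
    # A hypothetical client-side sketch of requesting speculative decoding:
    # the tensor name matches `req.num_draft_tokens` above, but the INT32
    # dtype and [1, 1] shape are assumptions to be checked against this
    # model's config.pbtxt.
    #
    #   import numpy as np
    #   import tritonclient.grpc as grpcclient
    #   draft = grpcclient.InferInput("num_draft_tokens", [1, 1], "INT32")
    #   draft.set_data_from_numpy(np.array([[4]], dtype=np.int32))
    #   # ...append `draft` to the usual inputs before client.infer(...)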

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing the `finalize` function is optional. It allows the model
        to perform any necessary cleanup before exit.
        """
        print('Cleaning up...')