|
from typing import Dict, List, Any |
|
import sys |
|
sys.path.append('./') |
|
from videollama2 import model_init, mm_infer |
|
from videollama2.utils import disable_torch_init |
|
|
|
class EndpointHandler:
    """Inference endpoint handler for VideoLLaMA2 video/image chat inference.

    Loads the model once at construction and serves requests via
    ``__call__``, matching the Hugging Face Inference Endpoints
    custom-handler contract.
    """

    # Fallback checkpoint used when the endpoint does not supply a path.
    DEFAULT_MODEL_PATH = 'DAMO-NLP-SG/VideoLLaMA2-7B'
    # Modalities the processor dict is keyed by (see self.processor[modal]).
    SUPPORTED_MODALS = ('video', 'image')

    def __init__(self, path: str = ""):
        """
        Initialize the handler by loading the model and any other necessary components.

        Args:
            path (str): The path (or hub identifier) of the model to load.
                When empty, falls back to ``DEFAULT_MODEL_PATH``.
        """
        disable_torch_init()
        # Fix: the original ignored `path` entirely and always loaded the
        # hard-coded checkpoint; honor the endpoint-supplied path when given.
        self.model_path = path or self.DEFAULT_MODEL_PATH
        self.model, self.processor, self.tokenizer = model_init(self.model_path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.

        Args:
            data (Dict[str, Any]): The input data for inference. Expected keys:
                - 'modal' (str): 'video' or 'image' (defaults to 'video')
                - 'modal_path' (str): Path to the video or image file
                - 'instruct' (str): The instruction/query to process

        Returns:
            List[Dict[str, Any]]: A single-element list holding the model output
            under the key ``"output"``.

        Raises:
            ValueError: If 'modal' is unsupported, or if 'modal_path' or
                'instruct' is missing/empty.
        """
        modal = data.get("modal", "video")
        modal_path = data.get("modal_path", "")
        instruct = data.get("instruct", "")

        # Validate early with a clear message instead of letting an
        # unsupported modal surface as a KeyError on the processor lookup.
        if modal not in self.SUPPORTED_MODALS:
            raise ValueError(
                f"'modal' must be one of {self.SUPPORTED_MODALS}, got {modal!r}."
            )
        if not modal_path or not instruct:
            raise ValueError("Both 'modal_path' and 'instruct' must be provided in the input data.")

        output = mm_infer(
            self.processor[modal](modal_path),
            instruct,
            model=self.model,
            tokenizer=self.tokenizer,
            do_sample=False,  # deterministic decoding for a serving endpoint
            modal=modal,
        )

        return [{"output": output}]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|