Aliayub1995
/

VideoLLaMA2-7B

Visual Question Answering

videollama2_mistral

text-generation

multimodal large language model

large video-language model

Inference Endpoints

Model card Files Files and versions Community

VideoLLaMA2-7B / handler.py

Aliayub1995's picture

Update handler.py

a0a5594 verified 3 months ago

1.35 kB

	from typing import Dict, List, Any
	import sys
	sys.path.append('./')
	from videollama2 import model_init, mm_infer
	from videollama2.utils import disable_torch_init
	import logging
	import numpy as np

	class EndpointHandler:
	    """Callable handler that loads a VideoLLaMA2 checkpoint once and then
	    runs a fixed scene-description prompt against each video it receives.
	    """

	    def __init__(self, path: str = ""):
	        """Load the model, processor and tokenizer used for inference.

	        Args:
	            path: Accepted for interface compatibility but not read by this
	                implementation; the checkpoint identifier is hard-coded
	                below.
	        """
	        # NOTE(review): presumably skips default weight initialisation to
	        # speed up loading -- confirm against videollama2.utils.
	        disable_torch_init()

	        self.model_path = 'Aliayub1995/VideoLLaMA2-7B'

	        # model_init hands back three objects, stored in this order.
	        components = model_init(self.model_path)
	        self.model, self.processor, self.tokenizer = components

	    def __call__(self, video_tensor: np.ndarray) -> List[Dict[str, Any]]:
	        """Describe the scenes of one video using the built-in prompt.

	        Args:
	            video_tensor: Video input, passed unmodified to the "video"
	                entry of the processor mapping.
	                NOTE(review): annotated as an ndarray; confirm what the
	                processor and the serving layer actually supply.

	        Returns:
	            A one-element list containing a dict whose "output" key holds
	            whatever ``mm_infer`` returned.
	        """
	        logging.info("Received video tensor")

	        # Modality and prompt are fixed; callers cannot override them.
	        modality = "video"
	        prompt = "Can you explain each scene and provide the exact time of the video in which it happened in this format [start_time: end_time]: Description, [start_time: end_time]: Description ..."

	        # Pick the preprocessing callable for the modality, then apply it.
	        preprocess = self.processor[modality]
	        prepared_video = preprocess(video_tensor)

	        # do_sample=False is forwarded as-is (presumably selects
	        # deterministic decoding -- confirm against mm_infer).
	        answer = mm_infer(
	            prepared_video,
	            prompt,
	            model=self.model,
	            tokenizer=self.tokenizer,
	            do_sample=False,
	            modal=modality,
	        )

	        return [{"output": answer}]