"""Custom inference endpoint handler wrapping a VideoLLaMA2 model."""

import logging
import sys
from typing import Any, Dict, List

import numpy as np

# Make the bundled ``videollama2`` package importable from the working directory.
sys.path.append('./')

from videollama2 import model_init, mm_infer  # noqa: E402
from videollama2.utils import disable_torch_init  # noqa: E402


class EndpointHandler:
    """Load a VideoLLaMA2 model once and describe the scenes of a video per call."""

    def __init__(self, path: str = ""):
        """
        Initialize the handler by loading the model and any other necessary components.

        Args:
            path (str): The path to the model or other necessary files.
                NOTE(review): this argument is currently unused; the model is
                always loaded from the hard-coded ``self.model_path`` below.
        """
        # Presumably skips torch's default weight initialization to speed up
        # loading of pretrained weights -- confirm against videollama2.utils.
        disable_torch_init()
        self.model_path = 'Aliayub1995/VideoLLaMA2-7B'
        self.model, self.processor, self.tokenizer = model_init(self.model_path)

    def __call__(self, video_tensor: np.ndarray) -> List[Dict[str, Any]]:
        """
        Run video inference with a fixed scene-description prompt.

        Args:
            video_tensor (np.ndarray): The video input; it is passed unchanged
                to the ``"video"`` entry of ``self.processor``. The expected
                shape/dtype is defined by that processor -- TODO confirm.

        Returns:
            List[Dict[str, Any]]: A single-element list of the form
            ``[{"output": <result of mm_infer>}]``.
        """
        logging.info("Received video tensor")  # Debugging: Confirm video tensor received

        # Default values
        modal = "video"
        instruct = (
            "Can you explain each scene and provide the exact time of the video "
            "in which it happened in this format [start_time: end_time]: Description, "
            "[start_time: end_time]: Description ..."
        )

        # Perform inference. ``do_sample=False`` presumably selects deterministic
        # (non-sampling) decoding -- confirm against mm_infer.
        output = mm_infer(
            self.processor[modal](video_tensor),
            instruct,
            model=self.model,
            tokenizer=self.tokenizer,
            do_sample=False,
            modal=modal,
        )
        return [{"output": output}]