import logging
import sys
from typing import Any, Dict, List

import numpy as np

# The working directory must be on the import path before the videollama2
# imports below run, so this statement has to stay ahead of them.
sys.path.append('./')
from videollama2 import model_init, mm_infer
from videollama2.utils import disable_torch_init

class EndpointHandler:
    """Callable wrapper that runs a fixed scene-description prompt on a video.

    The model, the processors (indexed by modality name) and the tokenizer are
    loaded once in ``__init__``; every call reuses them.
    """

    def __init__(self, path: str = ""):
        """
        Initialize the handler by loading the model and any other necessary components.

        Args:
            path (str): Not used by this implementation. The checkpoint
                identifier is hard-coded in ``self.model_path``.
        """
        disable_torch_init()
        self.model_path = 'Aliayub1995/VideoLLaMA2-7B'
        self.model, self.processor, self.tokenizer = model_init(self.model_path)

    def __call__(self, video_tensor: np.ndarray) -> List[Dict[str, Any]]:
        """
        Describe the scenes of a video using the loaded model.

        Args:
            video_tensor (np.ndarray): The video input. It is handed unchanged
                to the "video" processor.
                NOTE(review): the input form the processor expects is not
                visible here -- confirm it matches this annotation.

        Returns:
            List[Dict[str, Any]]: A one-element list whose dict stores the
            value returned by ``mm_infer`` under the key ``"output"``.
        """
        logging.info("Received video tensor")  # Debugging: Confirm video tensor received

        # Default values: the modality and the prompt are fixed for every request.
        modal = "video"
        instruct = "Can you explain each scene and provide the exact time of the video in which it happened in this format [start_time: end_time]: Description, [start_time: end_time]: Description ..."

        # Perform inference; do_sample=False is forwarded to mm_infer to turn sampling off.
        output = mm_infer(
            self.processor[modal](video_tensor),
            instruct,
            model=self.model,
            tokenizer=self.tokenizer,
            do_sample=False,
            modal=modal,
        )

        return [{"output": output}]