"""Custom Hugging Face Inference Endpoints handler for VideoLLaMA2.

Loads DAMO-NLP-SG/VideoLLaMA2-7B once at startup and serves video/image
instruction-following requests via ``mm_infer``.
"""
from typing import Any, Dict, List

import sys

sys.path.append('./')  # make the bundled videollama2 package importable

from videollama2 import model_init, mm_infer
from videollama2.utils import disable_torch_init

class EndpointHandler:
    def __init__(self, path: str = ""):
        """
        Initialize the handler by loading the model and its preprocessing components.

        Args:
            path (str): Accepted for interface compatibility with Inference
                Endpoints; unused here because the model ID is hard-coded.
        """
        disable_torch_init()  # skip redundant default weight initialization
        self.model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'
        # model_init returns the model, a dict of per-modal preprocessors
        # (keyed 'image'/'video'), and the tokenizer.
        self.model, self.processor, self.tokenizer = model_init(self.model_path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.
        
        Args:
            data (Dict[str, Any]): The input data for inference. Expected keys:
                                   - 'modal' (str): 'video' or 'image' (defaults to 'video')
                                   - 'modal_path' (str): Path to the video or image file
                                   - 'instruct' (str): The instruction/query to process

        Returns:
            List[Dict[str, Any]]: The output of the inference.
        """
        modal = data.get("modal", "video")
        modal_path = data.get("modal_path", "")
        instruct = data.get("instruct", "")

        if not modal_path or not instruct:
            raise ValueError("Both 'modal_path' and 'instruct' must be provided in the input data.")
        if modal not in self.processor:
            raise ValueError(f"Unsupported modal '{modal}'; expected 'video' or 'image'.")

        # Preprocess the media file with the modal-specific processor, then
        # run greedy (do_sample=False) multimodal inference.
        output = mm_infer(
            self.processor[modal](modal_path),
            instruct,
            model=self.model,
            tokenizer=self.tokenizer,
            do_sample=False,
            modal=modal,
        )

        return [{"output": output}]

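# Hedged client-side sketch of calling the deployed endpoint. The URL and
# token are placeholders (not defined in this repo), and `modal_path` must be
# a file path readable from inside the endpoint container:
#
#   import requests
#
#   resp = requests.post(
#       "https://<endpoint-url>",
#       headers={"Authorization": "Bearer <hf_token>"},
#       json={"modal": "video", "modal_path": "/data/clip.mp4",
#             "instruct": "Describe the video."},
#   )
#   print(resp.json())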

# Alternative implementation kept for reference: a generic transformers
# pipeline instead of the native videollama2 inference path.
#
# from transformers import pipeline

# class EndpointHandler:
#     def __init__(self, path: str = ""):
#         """
#         Initialize the handler by setting up the environment and loading the model.
#         """
#         # Use a pipeline as a high-level helper to download and load the model
#         self.pipe = pipeline("visual-question-answering", model="DAMO-NLP-SG/VideoLLaMA2-8x7B")
#         print("Model downloaded and pipeline created successfully.")

#     def __call__(self, data):
#         """
#         Handle inference requests.
#
#         Args:
#             data (dict): Input data containing 'image' and 'question'.
#
#         Returns:
#             dict: The output from the model.
#         """
#         image = data.get("image")
#         question = data.get("question")

#         if not image or not question:
#             raise ValueError("Both 'image' and 'question' must be provided in the input data.")

#         # Use the pipeline to perform visual question answering
#         output = self.pipe(image=image, question=question)
#
#         return output
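

# A minimal local smoke test, a sketch assuming a sample clip exists at
# ./assets/sample.mp4 (hypothetical path). It exercises the same payload
# shape the endpoint receives at runtime.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({
        "modal": "video",
        "modal_path": "./assets/sample.mp4",  # hypothetical example file
        "instruct": "Describe what happens in this video.",
    })
    print(result)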