File size: 1,472 Bytes
87ce8f2
 
 
 
 
8a770f2
87ce8f2
 
 
 
 
 
 
 
 
 
 
 
 
3e3aa94
8a770f2
 
3e3aa94
 
 
8a770f2
 
 
3e3aa94
 
8a770f2
3e3aa94
 
 
 
 
 
 
 
 
8a770f2
3e3aa94
87ce8f2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from typing import Dict, List, Any
import sys
sys.path.append('./')
from videollama2 import model_init, mm_infer
from videollama2.utils import disable_torch_init
import logging

class EndpointHandler:
    """Callable request handler that runs VideoLLaMA2 inference.

    The model, the modality-keyed processors and the tokenizer are loaded once
    in ``__init__``; every call then serves a single request dictionary.
    """

    # Model identifier handed to ``model_init`` when the handler is created.
    MODEL_PATH = 'DAMO-NLP-SG/VideoLLaMA2-7B'

    def __init__(self, path: str = ""):
        """
        Initialize the handler by loading the model, processors and tokenizer.

        Args:
            path (str): Accepted for interface compatibility but currently
                unused; the model is always loaded from ``MODEL_PATH``.
        """
        # Helper from videollama2.utils, called before the model is built.
        # NOTE(review): presumably skips default torch weight init to speed up
        # loading -- confirm against the videollama2 sources.
        disable_torch_init()
        self.model_path = self.MODEL_PATH
        self.model, self.processor, self.tokenizer = model_init(self.model_path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Run inference for one request.

        Args:
            data: Request payload. Keys read:
                - ``"modal"``: key used to select the processor
                  (defaults to ``"video"``).
                - ``"modal_path"``: value handed to the selected processor
                  (required, must be non-empty).
                - ``"instruct"``: instruction handed to ``mm_infer``
                  (required, must be non-empty).

        Returns:
            A one-element list ``[{"output": <result of mm_infer>}]``.

        Raises:
            ValueError: If ``modal_path`` or ``instruct`` is missing or empty,
                or if no processor exists for the requested ``modal``.
        """
        # Lazy %-style arguments: the message is only formatted when the
        # record is actually emitted.
        logging.info("Received data: %s", data)

        modal = data.get("modal", "video")
        modal_path = data.get("modal_path", "")
        instruct = data.get("instruct", "")

        logging.info(
            "Modal: %s, Modal Path: %s, Instruct: %s", modal, modal_path, instruct
        )

        if not modal_path or not instruct:
            raise ValueError("Both 'modal_path' and 'instruct' must be provided in the input data.")

        # Resolve the processor up front so that an unknown modality is
        # reported as a bad-input ValueError (like the check above) instead of
        # leaking a bare KeyError from the lookup.
        try:
            preprocess = self.processor[modal]
        except KeyError:
            raise ValueError(
                f"Unsupported modal {modal!r}: no processor is available for it."
            ) from None

        # Perform inference (do_sample=False is passed through to mm_infer).
        output = mm_infer(
            preprocess(modal_path),
            instruct,
            model=self.model,
            tokenizer=self.tokenizer,
            do_sample=False,
            modal=modal,
        )

        return [{"output": output}]