|
from typing import Dict, List, Any |
|
import sys |
|
sys.path.append('./') |
|
from videollama2 import model_init, mm_infer |
|
from videollama2.utils import disable_torch_init |
|
|
|
class EndpointHandler:
    """Inference endpoint handler for VideoLLaMA2 video/image chat inference.

    Loads the model once at construction and serves requests via
    ``__call__``, matching the Hugging Face Inference Endpoints
    custom-handler contract.
    """

    # Fallback checkpoint used when the endpoint does not supply a path.
    DEFAULT_MODEL_PATH = 'DAMO-NLP-SG/VideoLLaMA2-7B'
    # Modalities the processor dict is keyed by (see self.processor[modal]).
    SUPPORTED_MODALS = ('video', 'image')

    def __init__(self, path: str = ""):
        """
        Initialize the handler by loading the model and any other necessary components.

        Args:
            path (str): The path (or hub identifier) of the model to load.
                When empty, falls back to ``DEFAULT_MODEL_PATH``.
        """
        disable_torch_init()
        # Fix: the original ignored `path` entirely and always loaded the
        # hard-coded checkpoint; honor the endpoint-supplied path when given.
        self.model_path = path or self.DEFAULT_MODEL_PATH
        self.model, self.processor, self.tokenizer = model_init(self.model_path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.

        Args:
            data (Dict[str, Any]): The input data for inference. Expected keys:
                - 'modal' (str): 'video' or 'image' (defaults to 'video')
                - 'modal_path' (str): Path to the video or image file
                - 'instruct' (str): The instruction/query to process

        Returns:
            List[Dict[str, Any]]: A single-element list holding the model output
            under the key ``"output"``.

        Raises:
            ValueError: If 'modal' is unsupported, or if 'modal_path' or
                'instruct' is missing/empty.
        """
        modal = data.get("modal", "video")
        modal_path = data.get("modal_path", "")
        instruct = data.get("instruct", "")

        # Validate early with a clear message instead of letting an
        # unsupported modal surface as a KeyError on the processor lookup.
        if modal not in self.SUPPORTED_MODALS:
            raise ValueError(
                f"'modal' must be one of {self.SUPPORTED_MODALS}, got {modal!r}."
            )
        if not modal_path or not instruct:
            raise ValueError("Both 'modal_path' and 'instruct' must be provided in the input data.")

        output = mm_infer(
            self.processor[modal](modal_path),
            instruct,
            model=self.model,
            tokenizer=self.tokenizer,
            do_sample=False,  # deterministic decoding for a serving endpoint
            modal=modal,
        )

        return [{"output": output}]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|