Aliayub1995
/

VideoLLaMA2-7B

Visual Question Answering

videollama2_mistral

text-generation

multimodal large language model

large video-language model

Inference Endpoints

Model card | Files and versions | Community

Aliayub1995 committed on Sep 3

Commit

024e8dc

•

1 Parent(s): 40fcb49

Update handler.py

Files changed (1) hide show

handler.py +8 -29

handler.py CHANGED Viewed

@@ -1,10 +1,10 @@
-from typing import Dict, List, Any
 import sys
 sys.path.append('./')
 from videollama2 import model_init, mm_infer
 from videollama2.utils import disable_torch_init
 import logging
-import os
 class EndpointHandler:
     def __init__(self, path: str = ""):
@@ -18,37 +18,16 @@ class EndpointHandler:
         self.model_path = 'Aliayub1995/VideoLLaMA2-7B'
         self.model, self.processor, self.tokenizer = model_init(self.model_path)
-    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-        logging.info(f"Received data: {data}")  # Debugging: Print received data
-        # Initialize variables
-        current_path = os.getcwd()
-        logging.info(f"Current Path: {current_path}")
-        dir = os.walk("../app")
-        # Iterate through the generator
-        for dirpath, dirnames, filenames in dir:
-            logging.info(f"Current Path: {dirpath}")
-            logging.info(f"Directories: {dirnames}")
-            logging.info(f"Files: {filenames}")
-            logging.info("-" * 40)
-        logging.info(f"Directory struct: {dir}")
-        modal = None
-        modal_path = None
-        instruct = None
-        # Extract input data
-        inputs = data.get("inputs", data)
-        modal = inputs.get("modal", "video")
-        modal_path = inputs.get("modal_path", "")
-        instruct = inputs.get("instruct", "")
-        logging.info(f"Modal: {modal}, Modal Path: {modal_path}, Instruct: {instruct}")  # Debugging: Print extracted values
-        if not modal_path or not instruct:
-            raise ValueError("Both 'modal_path' and 'instruct' must be provided in the input data.")
         # Perform inference
         output = mm_infer(
-            self.processor[modal](modal_path),
             instruct,
             model=self.model,
             tokenizer=self.tokenizer,

+from typing import List, Any
 import sys
 sys.path.append('./')
 from videollama2 import model_init, mm_infer
 from videollama2.utils import disable_torch_init
 import logging
+import numpy as np
 class EndpointHandler:
     def __init__(self, path: str = ""):
         self.model_path = 'Aliayub1995/VideoLLaMA2-7B'
         self.model, self.processor, self.tokenizer = model_init(self.model_path)
+    def __call__(self, video_tensor: np.ndarray) -> List[Dict[str, Any]]:
+        logging.info("Received video tensor")  # Debugging: Confirm video tensor received
+        # Default values
+        modal = "video"
+        instruct = "Can you explain each scene and provide the exact time of the video in which it happened in this format [start_time: end_time]: Description, [start_time: end_time]: Description ..."
         # Perform inference
         output = mm_infer(
+            self.processor[modal](video_tensor),
             instruct,
             model=self.model,
             tokenizer=self.tokenizer,