Aliayub1995/VideoLLaMA2-7B

Visual Question Answering

videollama2_mistral

text-generation

multimodal large language model

large video-language model

Inference Endpoints

Model card Files Files and versions Community

Aliayub1995 committed on Sep 4, 2024

Commit

7095ee5

·

verified ·

1 Parent(s): a0a5594

Update videollama2/__init__.py

Files changed (1) hide show

videollama2/__init__.py +4 -2

videollama2/__init__.py CHANGED Viewed

@@ -3,6 +3,7 @@ import copy
 import warnings
 import shutil
 from functools import partial
 import torch
 import logging
@@ -32,7 +33,7 @@ def model_init(model_path=None, **kwargs):
     return model, processor, tokenizer
-def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs):
     """inference api of VideoLLaMA2 for video understanding.
     Args:
@@ -45,7 +46,8 @@ def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs
     Returns:
         str: response of the model.
     """
     # 1. text preprocess (tag process & generate prompt).
     if modal == 'image':
         modal_token = DEFAULT_IMAGE_TOKEN

 import warnings
 import shutil
 from functools import partial
+import gdown
 import torch
 import logging
     return model, processor, tokenizer
+def mm_infer(url, instruct, model, tokenizer, modal='video', **kwargs):
     """inference api of VideoLLaMA2 for video understanding.
     Args:
     Returns:
         str: response of the model.
     """
+    image_or_video = 'Temp.mp4'
+    gdown.download(url, output, quiet=False)
     # 1. text preprocess (tag process & generate prompt).
     if modal == 'image':
         modal_token = DEFAULT_IMAGE_TOKEN