Update README.md
README.md CHANGED

@@ -103,10 +103,10 @@ def inference(args):
         audio_video_tensor = preprocess(audio_video_path)
     else:
         audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
-    question = f"Please describe the video with
+    question = f"Please describe the video with audio information."

     # Audio Inference
-    audio_video_path = "assets/
+    audio_video_path = "assets/bird-twitter-car.wav"
     preprocess = processor['audio' if args.modal_type == "a" else "video"]
     if args.modal_type == "a":
         audio_video_tensor = preprocess(audio_video_path)
@@ -115,13 +115,13 @@ def inference(args):
     question = f"Please describe the audio."

     # Video Inference
-    audio_video_path = "assets/
+    audio_video_path = "assets/output_v_1jgsRbGzCls.mp4"
     preprocess = processor['audio' if args.modal_type == "a" else "video"]
     if args.modal_type == "a":
         audio_video_tensor = preprocess(audio_video_path)
     else:
         audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
-    question = f"
+    question = f"What activity are the people practicing in the video?"

     output = mm_infer(
         audio_video_tensor,
@@ -138,11 +138,12 @@ def inference(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()

-    parser.add_argument('--model-path', help='', required=
+    parser.add_argument('--model-path', help='', required=False, default='DAMO-NLP-SG/VideoLLaMA2.1-7B-AV')
     parser.add_argument('--modal-type', choices=["a", "v", "av"], help='', required=True)
     args = parser.parse_args()

     inference(args)
+
 ```

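For context, the hunks above show only fragments of the README's inference example. Below is a minimal, self-contained sketch that assembles them into one runnable script. Only the pieces visible in the diff (the asset path, the question prompts, the `processor`/`preprocess` usage, and the argparse flags) come from this commit; the `videollama2` import path, the values returned by `model_init`, and the keyword arguments passed to `mm_infer` are assumptions about the surrounding VideoLLaMA2 example and may differ from the full README.

```python
import argparse

# Assumed import path: the diff shows only the body of inference(), so the
# package layout below is an assumption based on the VideoLLaMA2 project.
from videollama2 import model_init, mm_infer


def inference(args):
    # Assumption: model_init returns (model, processor, tokenizer), where
    # `processor` maps modality names to preprocessing callables (the diff
    # itself only shows processor['audio'] / processor['video'] being indexed).
    model, processor, tokenizer = model_init(args.model_path)

    # Asset path and prompt taken from the video-inference hunk of this commit.
    audio_video_path = "assets/output_v_1jgsRbGzCls.mp4"
    preprocess = processor['audio' if args.modal_type == "a" else "video"]
    if args.modal_type == "a":
        audio_video_tensor = preprocess(audio_video_path)
    else:
        audio_video_tensor = preprocess(audio_video_path, va=True if args.modal_type == "av" else False)
    question = "What activity are the people practicing in the video?"

    # Assumption: mm_infer's keyword arguments; the diff only shows the call
    # being opened with `audio_video_tensor` as the first argument.
    output = mm_infer(
        audio_video_tensor,
        question,
        model=model,
        tokenizer=tokenizer,
        modal='audio' if args.modal_type == "a" else 'video',
        do_sample=False,
    )
    print(output)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Argument defaults as introduced in the last hunk of this commit.
    parser.add_argument('--model-path', help='', required=False, default='DAMO-NLP-SG/VideoLLaMA2.1-7B-AV')
    parser.add_argument('--modal-type', choices=["a", "v", "av"], help='', required=True)
    args = parser.parse_args()

    inference(args)
```

If saved as, say, `inference.py` (the file name is hypothetical), it could be run as `python inference.py --modal-type av`; with the change in the last hunk, `--model-path` can be omitted and falls back to the `DAMO-NLP-SG/VideoLLaMA2.1-7B-AV` checkpoint.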