Commit f2b92aa by Amy Roberts
Parent(s): 424ae15

Finalise
Files changed:
- .gitattributes +4 -0
- README.md +2 -2
- app.py +151 -0
- examples/bed.mp4 +3 -0
- examples/book.mp4 +3 -0
- examples/food.mp4 +3 -0
- requirements.txt +8 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/ filter=lfs diff=lfs merge=lfs -text
+examples/bed.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/book.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/food.mp4 filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
-title:
-emoji:
+title: TVP
+emoji: ❓📽️
 colorFrom: purple
 colorTo: pink
 sdk: gradio
app.py
ADDED
@@ -0,0 +1,151 @@
import os

import av
import cv2
import numpy as np
import torch
import gradio as gr
from transformers import AutoProcessor, TvpForVideoGrounding


def pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
    '''
    Convert the video from its original fps to the target_fps and decode the video with the PyAV decoder.
    Args:
        container (container): pyav container.
        sampling_rate (int): frame sampling rate (interval between two sampled frames).
        num_frames (int): number of frames to sample.
        clip_idx (int): if clip_idx is -1, perform random temporal sampling.
            If clip_idx is larger than -1, uniformly split the video into num_clips
            clips, and select the clip_idx-th video clip.
        num_clips (int): overall number of clips to uniformly sample from the given video.
        target_fps (int): the input video may have a different fps, convert it to
            the target video fps before frame sampling.
    Returns:
        frames (tensor): decoded frames from the video. Return None if no
            video stream was found.
        fps (float): the number of frames per second of the video.
    '''
    video = container.streams.video[0]
    fps = float(video.average_rate)
    clip_size = sampling_rate * num_frames / target_fps * fps
    delta = max(num_frames - clip_size, 0)
    start_idx = delta * clip_idx / num_clips
    end_idx = start_idx + clip_size - 1
    timebase = video.duration / num_frames
    video_start_pts = int(start_idx * timebase)
    video_end_pts = int(end_idx * timebase)
    seek_offset = max(video_start_pts - 1024, 0)
    container.seek(seek_offset, any_frame=False, backward=True, stream=video)
    frames = {}
    for frame in container.decode(video=0):
        if frame.pts < video_start_pts:
            continue
        frames[frame.pts] = frame
        if frame.pts > video_end_pts:
            break
    frames = [frames[pts] for pts in sorted(frames)]
    return frames, fps


def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
    '''
    Decode the video and perform temporal sampling.
    Args:
        container (container): pyav container.
        sampling_rate (int): frame sampling rate (interval between two sampled frames).
        num_frames (int): number of frames to sample.
        clip_idx (int): if clip_idx is -1, perform random temporal sampling.
            If clip_idx is larger than -1, uniformly split the video into num_clips
            clips, and select the clip_idx-th video clip.
        num_clips (int): overall number of clips to uniformly sample from the given video.
        target_fps (int): the input video may have a different fps, convert it to
            the target video fps before frame sampling.
    Returns:
        frames (tensor): decoded frames from the video.
    '''
    assert clip_idx >= -2, "Not a valid clip_idx {}".format(clip_idx)
    frames, fps = pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps)
    clip_size = sampling_rate * num_frames / target_fps * fps
    index = np.linspace(0, clip_size - 1, num_frames)
    index = np.clip(index, 0, len(frames) - 1).astype(np.int64)
    frames = np.array([frames[idx].to_rgb().to_ndarray() for idx in index])
    frames = frames.transpose(0, 3, 1, 2)
    return frames


def get_video_duration(filename):
    cap = cv2.VideoCapture(filename)
    if cap.isOpened():
        rate = cap.get(5)  # cv2.CAP_PROP_FPS
        frame_num = cap.get(7)  # cv2.CAP_PROP_FRAME_COUNT
        duration = frame_num / rate
        return duration
    return -1


def predict_durations(model_checkpoint, text, video_filename, device="cpu"):
    print(f"Loading model: {model_checkpoint}")
    model = TvpForVideoGrounding.from_pretrained(model_checkpoint)
    processor = AutoProcessor.from_pretrained(model_checkpoint)
    print(f"Loading video: {video_filename}")
    raw_sampled_frames = decode(
        container=av.open(video_filename, metadata_errors="ignore"),
        sampling_rate=1,
        num_frames=model.config.num_frames,
        clip_idx=0,
        num_clips=1,
        target_fps=3,
    )
    print("Processing video and text")
    model_inputs = processor(
        text=[text], videos=list(raw_sampled_frames), return_tensors="pt", max_text_length=100
    ).to(device)
    # model_inputs["pixel_values"] = model_inputs["pixel_values"].to(model.dtype)
    print("Running inference")
    output = model(**model_inputs)
    duration = get_video_duration(video_filename)
    start, end = processor.post_process_video_grounding(output.logits, duration)
    return f"start: {start}s, end: {end}s"


HF_TOKEN = os.environ.get("HF_TOKEN", None)
DEVICE = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
MODELS = ["Intel/tvp-base", "Intel/tvp-base-ANet"]
EXAMPLES = [
    ["./examples/bed.mp4", "a person is sitting on a bed."],
    ["./examples/food.mp4", "a person eats some food."],
    ["./examples/book.mp4", "a person reads a book."],
]

model_checkpoint = gr.Dropdown(MODELS, label="Model", value=MODELS[0], type="value")
video_in = gr.Video(label="Video File", elem_id="video_in")
text_in = gr.Textbox(label="Text", placeholder="Description of event in the video", interactive=True)
text_out = gr.Textbox(label="Prediction", placeholder="Predicted start and end time")


title = "Video Grounding with TVP"
DESCRIPTION = """# Video Grounding with TVP"""
css = """.toast-wrap { display: none !important } """
with gr.Blocks(title=title) as demo:
    gr.Markdown(DESCRIPTION)
    with gr.Row():
        model_checkpoint.render()

    with gr.Row():
        examples = gr.Examples(examples=EXAMPLES, inputs=[video_in, text_in])

    with gr.Row():
        with gr.Column():
            video_in.render()

        with gr.Column():
            text_in.render()
            time_button = gr.Button("Get start and end time")
            time_button.click(predict_durations, inputs=[model_checkpoint, text_in, video_in], outputs=[text_out])
            text_out.render()


if __name__ == "__main__":
    demo.launch(debug=True)
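For reference, a minimal usage sketch (not part of this commit) of how the prediction function defined in app.py could be called directly, without going through the Gradio UI. It assumes the snippet runs next to app.py with the example clips from this commit checked out; the checkpoint and text are taken from the EXAMPLES list in app.py.

    # Hypothetical driver script, assuming app.py and ./examples/ are in the working directory.
    from app import predict_durations

    # Ground the text description in the example video with one of the listed checkpoints.
    result = predict_durations(
        model_checkpoint="Intel/tvp-base",
        text="a person is sitting on a bed.",
        video_filename="./examples/bed.mp4",
    )
    print(result)  # e.g. "start: <start time>s, end: <end time>s"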
examples/bed.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:43c45bc34c842328323fa568ea65c1ff6df9686e3a9d93f804bb4ab2b241d5c7
size 1156824
examples/book.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:72bb9a4e2843233591252da02790f57f4194bfe5f13278c4e74d00e77590d984
size 740098
examples/food.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:271940f652b2d51ce19f130731079e1a899eae1df15b1ecab0323c917962d539
size 4077619
requirements.txt
ADDED
@@ -0,0 +1,8 @@
av
huggingface_hub
gradio
numpy
transformers @ git+https://github.com/huggingface/transformers.git
torch
opencv-python
Pillow