Spaces:
Running
on
A10G
Running
on
A10G
#@title Get bounding boxes for the subject | |
from transformers import pipeline | |
from moviepy.editor import VideoFileClip | |
from PIL import Image | |
import os | |
import numpy as np | |
import matplotlib.pyplot as plt | |
import tqdm | |
import pickle | |
import torch | |
# Hugging Face checkpoint used for zero-shot (free-text label) object detection.
checkpoint = "google/owlvit-large-patch14"
# Build the detection pipeline once at import time; it is reused for every frame.
# Prefer the first CUDA device, but fall back to CPU so the script still runs
# (slowly) on machines without a GPU instead of failing at start-up.
detector = pipeline(
    model=checkpoint,
    task="zero-shot-object-detection",
    device='cuda:0' if torch.cuda.is_available() else 'cpu',
)
def get_bounding_boxes(clip_path, subject):
    """Detect `subject` in every frame of a video and rasterize its box as a mask.

    For each frame the module-level `detector` is queried with `subject` as the
    only candidate label. The first returned prediction (presumably the
    highest-scoring one -- confirm against the pipeline's ordering) is drawn as
    a filled rectangle of ones on a zero canvas with the frame's
    (height, width) shape. Frames with no prediction contribute an all-zero
    canvas, so the returned list always has one entry per frame.

    Args:
        clip_path: Path of the video file to read.
        subject: Text label of the object to look for.

    Returns:
        A tuple `(all_bboxes, num_bb)` where `all_bboxes` is a list of 2-D
        numpy arrays (one per frame) and `num_bb` is the number of frames in
        which a box was found.
    """
    all_bboxes = []
    num_bb = 0

    clip = VideoFileClip(clip_path)
    try:
        for frame in clip.iter_frames():
            frame = Image.fromarray(frame)
            # PIL reports size as (width, height); numpy arrays are (rows, cols).
            width, height = frame.size
            canvas = np.zeros((height, width))

            predictions = detector(
                frame,
                candidate_labels=[subject],
            )

            # An empty result means "subject not found": keep the blank canvas.
            if predictions:
                box = predictions[0]["box"]
                # Clamp to the frame: a predicted box may extend past the image
                # border, and a negative index would wrap around in numpy
                # slicing and paint the wrong region (or nothing at all).
                x_min = min(max(int(box["xmin"]), 0), width)
                y_min = min(max(int(box["ymin"]), 0), height)
                x_max = min(max(int(box["xmax"]), 0), width)
                y_max = min(max(int(box["ymax"]), 0), height)
                canvas[y_min:y_max, x_min:x_max] = 1
                num_bb += 1

            all_bboxes.append(canvas)
    finally:
        # Release the underlying video reader even if detection fails.
        clip.close()

    return all_bboxes, num_bb
import pickle as pkl

# Root directory holding one sub-directory of generated videos per dataset item.
dir_path = '/your/result/path'
# Name of the generated video file to evaluate inside each sub-directory.
video_filename = '2_of_40_2.mp4'
output_bbox = []

# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only point this at a dataset file you trust.
with open("/ssv2dataset/path.pkl", "rb") as f:
    data = pkl.load(f)

dataset_size = len(data)
failed_cnt = 0

# Wrap `data` itself (not the enumerate object) so tqdm can read its length
# and display a total / ETA.
for i, d in enumerate(tqdm.tqdm(data)):
    try:
        # print(f"{d['subject']} || {d['caption']} || {d['video']}")
        # The directory is named after the source video without its extension.
        filename = d['video'].split('.')[0]
        video_path = os.path.join(dir_path, filename, video_filename)
        fg_object = d['subject']
        masks, num_bb = get_bounding_boxes(video_path, fg_object)
        output_bbox.append({
            'caption': d['caption'],
            'video': d['video'],
            'subject': d['subject'],
            'mask': masks,
            'num_bb': num_bb,
        })
        # print(num_bb)
    except Exception as e:
        # Best effort: skip items whose video is missing or unreadable, but
        # report why. Catching Exception (not a bare except) keeps Ctrl-C and
        # SystemExit working.
        print(f"Missed #{i} with Caption: {d['caption']} ({type(e).__name__}: {e})")
        failed_cnt += 1

# Persist all collected per-video masks in a single pickle file.
with open(f"/output/path/iou_eval/ssv2_modelscope_{video_filename.split('.')[0]}_bbox-v2.pkl", "wb") as f:
    pkl.dump(output_bbox, f)

print(f"Failed: {failed_cnt} out of {dataset_size}")