import os

import cv2
import numpy as np
import torch
from transformers import CLIPProcessor, CLIPVisionModelWithProjection
from transformers import logging

# Silence verbose transformers warnings (e.g. about unused weights).
logging.set_verbosity_error()


class FeatureExtractor:
    """Extracts CLIP image embeddings from sampled video frames."""

    def __init__(self, args):
        self.device = args.feature_extractor_device
        self.beta = args.beta  # seconds between sampled frames
        self.processor = CLIPProcessor.from_pretrained(args.feature_extractor)
        self.model = CLIPVisionModelWithProjection.from_pretrained(args.feature_extractor).to(self.device)
        self.data_dir = args.data_dir
        self.tmp_dir = args.tmp_dir  # cache directory for extracted features

    def __call__(self, video_path, video_id):
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        video_length = frame_count / fps  # duration in seconds
        sample_rate = int(fps) * self.beta  # keep one frame every `beta` seconds

        # Reuse cached features if this video has already been processed.
        save_path = os.path.join(self.tmp_dir, video_id + '.npz')
        if os.path.exists(save_path):
            cap.release()
            data = np.load(save_path)
            clip_features = data['features']
            return clip_features, video_length

        clip_features = []
        print("Extracting CLIP features.")
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Only encode frames that fall on the sampling grid.
            if cap.get(cv2.CAP_PROP_POS_FRAMES) % sample_rate == 0:
                image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV gives BGR; CLIP expects RGB
                inputs = self.processor(images=image, return_tensors="pt").pixel_values
                inputs = inputs.to(self.device)
                with torch.no_grad():
                    feat = self.model(inputs)['image_embeds']
                clip_features.append(feat.cpu().numpy())
        cap.release()
        print("Finished.")

        # Stack per-frame embeddings into a (num_frames, dim) array and cache them.
        clip_features = np.concatenate(clip_features, axis=0)
        np.savez_compressed(save_path, features=clip_features)
        return clip_features, video_length
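

# Minimal usage sketch (not part of the original module): it builds a hypothetical
# argument namespace with the attribute names that __init__ and __call__ read above.
# The checkpoint name, paths, and beta value are illustrative assumptions, not values
# taken from this repo.
if __name__ == '__main__':
    from types import SimpleNamespace

    args = SimpleNamespace(
        feature_extractor='openai/clip-vit-base-patch32',  # assumed CLIP checkpoint
        feature_extractor_device='cuda' if torch.cuda.is_available() else 'cpu',
        beta=1,            # assumed: sample roughly one frame per second
        data_dir='data',   # placeholder path
        tmp_dir='tmp',     # cache directory for the .npz feature files
    )
    os.makedirs(args.tmp_dir, exist_ok=True)

    extractor = FeatureExtractor(args)
    features, length = extractor('example.mp4', 'example')  # placeholder video
    print(features.shape, length)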