import os

import cv2
import numpy as np
import torch

from .simple_tokenizer import SimpleTokenizer as _Tokenizer
from .viclip import ViCLIP


def get_viclip(size='l',
               pretrain=os.path.join(os.path.dirname(os.path.abspath(__file__)), "ViClip-InternVid-10M-FLT.pth")):
    """Build a ViCLIP model and its tokenizer from a local checkpoint."""
    tokenizer = _Tokenizer()
    vclip = ViCLIP(tokenizer=tokenizer, size=size, pretrain=pretrain)
    m = {'viclip': vclip, 'tokenizer': tokenizer}
    return m
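
# A minimal usage sketch, assuming the default ViCLIP-L checkpoint
# ("ViClip-InternVid-10M-FLT.pth") sits next to this file; pass `pretrain`
# explicitly if it lives elsewhere:
#
#   m = get_viclip(size='l')
#   clip, tokenizer = m['viclip'], m['tokenizer']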


def get_text_feat_dict(texts, clip, tokenizer, text_feat_d=None):
    """Encode each text with the text tower, collecting features into a dict."""
    # A mutable default ({}) would leak state across calls, so default to None.
    if text_feat_d is None:
        text_feat_d = {}
    for t in texts:
        feat = clip.get_text_features(t, tokenizer, text_feat_d)
        text_feat_d[t] = feat
    return text_feat_d
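
# The dict is passed through to clip.get_text_features, so entries already
# present can be reused across calls. Sketch (assumes `m` from get_viclip):
#
#   cache = get_text_feat_dict(["a dog runs", "a cat sleeps"], m['viclip'], m['tokenizer'])
#   cache = get_text_feat_dict(["a dog runs"], m['viclip'], m['tokenizer'], cache)  # reuses cached entry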


def get_vid_feat(frames, clip):
    """Encode a (1, T, C, H, W) frame tensor into a single video feature."""
    return clip.get_vid_features(frames)


def _frame_from_video(video):
    """Yield BGR frames from an open cv2.VideoCapture until the stream ends."""
    while video.isOpened():
        success, frame = video.read()
        if success:
            yield frame
        else:
            break
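
# Typical pairing with OpenCV (hypothetical path "example.mp4"):
#
#   video = cv2.VideoCapture("example.mp4")
#   frames = [f for f in _frame_from_video(video)]
#   video.release()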


# ImageNet channel mean/std, applied after scaling pixels to [0, 1].
v_mean = np.array([0.485, 0.456, 0.406]).reshape(1, 1, 3)
v_std = np.array([0.229, 0.224, 0.225]).reshape(1, 1, 3)


def normalize(data):
    return (data / 255.0 - v_mean) / v_std


def frames2tensor(vid_list, fnum=8, target_size=(224, 224), device=torch.device('cuda')):
    """Subsample fnum frames, resize, normalize, and stack into a (1, fnum, C, H, W) tensor."""
    assert len(vid_list) >= fnum
    step = len(vid_list) // fnum
    vid_list = vid_list[::step][:fnum]
    # OpenCV frames are BGR; reverse the channel axis to get RGB before resizing.
    vid_list = [cv2.resize(x[:, :, ::-1], target_size) for x in vid_list]
    vid_tube = [np.expand_dims(normalize(x), axis=(0, 1)) for x in vid_list]
    vid_tube = np.concatenate(vid_tube, axis=1)
    # (1, fnum, H, W, C) -> (1, fnum, C, H, W)
    vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3))
    vid_tube = torch.from_numpy(vid_tube).to(device, non_blocking=True).float()
    return vid_tube
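
# Given at least 8 decoded frames, this yields a float tensor of shape
# (1, 8, 3, 224, 224) on `device`. Sketch on CPU, to avoid assuming CUDA:
#
#   clip_tensor = frames2tensor(frames, fnum=8, device=torch.device('cpu'))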


def retrieve_text(frames,
                  texts,
                  models=None,
                  topk=5,
                  device=torch.device('cuda')):
    """Rank texts against a clip of frames; return the top-k texts and their probabilities."""
    assert isinstance(models, dict) and models.get('viclip') is not None and models.get('tokenizer') is not None
    clip, tokenizer = models['viclip'], models['tokenizer']
    clip = clip.to(device)
    frames_tensor = frames2tensor(frames, device=device)
    vid_feat = get_vid_feat(frames_tensor, clip)
    text_feat_d = get_text_feat_dict(texts, clip, tokenizer)
    text_feats = [text_feat_d[t] for t in texts]
    text_feats_tensor = torch.cat(text_feats, 0)
    probs, idxs = clip.get_predict_label(vid_feat, text_feats_tensor, top=topk)
    # Move to CPU before converting to numpy in case the tensors live on GPU.
    ret_texts = [texts[i] for i in idxs.cpu().numpy()[0].tolist()]
    return ret_texts, probs.cpu().numpy()[0]
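

if __name__ == '__main__':
    # End-to-end sketch, not part of the module's API. Assumes a local video at
    # the hypothetical path "example.mp4", the default checkpoint next to this
    # file, and a CUDA device (pass device=torch.device('cpu') otherwise).
    # Because of the relative imports above, run it as a module, e.g.
    # `python -m <package>.<this_module>`.
    models = get_viclip(size='l')
    video = cv2.VideoCapture("example.mp4")
    frames = [f for f in _frame_from_video(video)]
    video.release()
    texts = ["a person is cooking",
             "a dog is running",
             "a car drives down a road"]
    ret_texts, probs = retrieve_text(frames, texts, models=models, topk=3)
    for t, p in zip(ret_texts, probs):
        print(f"{p:.4f}  {t}")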