import os

import cv2
import numpy as np
import torch

from .simple_tokenizer import SimpleTokenizer as _Tokenizer
from .viclip import ViCLIP

def get_viclip(size='l',
               pretrain=os.path.join(os.path.dirname(os.path.abspath(__file__)), "ViClip-InternVid-10M-FLT.pth")):
    # Build the ViCLIP model and its tokenizer, returning both in a dict.
    tokenizer = _Tokenizer()
    vclip = ViCLIP(tokenizer=tokenizer, size=size, pretrain=pretrain)
    m = {'viclip': vclip, 'tokenizer': tokenizer}
    return m

def get_text_feat_dict(texts, clip, tokenizer, text_feat_d=None):
    # Encode each prompt and cache its feature so repeated texts are not re-encoded.
    if text_feat_d is None:
        text_feat_d = {}
    for t in texts:
        feat = clip.get_text_features(t, tokenizer, text_feat_d)
        text_feat_d[t] = feat
    return text_feat_d

def get_vid_feat(frames, clip):
    # Encode a batch of video clips of shape (B, T, C, H, W) into video embeddings.
    return clip.get_vid_features(frames)

def _frame_from_video(video):
    # Yield BGR frames from an opened cv2.VideoCapture until the stream ends.
    while video.isOpened():
        success, frame = video.read()
        if success:
            yield frame
        else:
            break

# ImageNet mean/std used to normalize RGB frames after scaling to [0, 1].
v_mean = np.array([0.485, 0.456, 0.406]).reshape(1, 1, 3)
v_std = np.array([0.229, 0.224, 0.225]).reshape(1, 1, 3)


def normalize(data):
    return (data / 255.0 - v_mean) / v_std

def frames2tensor(vid_list, fnum=8, target_size=(224, 224), device=torch.device('cuda')):
    # Uniformly sample fnum frames, convert BGR->RGB, resize, normalize, and stack
    # them into a float tensor of shape (1, fnum, 3, H, W) on the target device.
    assert len(vid_list) >= fnum
    step = len(vid_list) // fnum
    vid_list = vid_list[::step][:fnum]
    vid_list = [cv2.resize(x[:, :, ::-1], target_size) for x in vid_list]
    vid_tube = [np.expand_dims(normalize(x), axis=(0, 1)) for x in vid_list]
    vid_tube = np.concatenate(vid_tube, axis=1)
    vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3))
    vid_tube = torch.from_numpy(vid_tube).to(device, non_blocking=True).float()
    return vid_tube

def retrieve_text(frames,
                  texts,
                  models={'viclip': None, 'tokenizer': None},
                  topk=5,
                  device=torch.device('cuda')):
    # Rank the candidate texts against the video frames and return the top-k
    # matches together with their probabilities.
    assert isinstance(models, dict) and models['viclip'] is not None and models['tokenizer'] is not None
    clip, tokenizer = models['viclip'], models['tokenizer']
    clip = clip.to(device)

    frames_tensor = frames2tensor(frames, device=device)
    vid_feat = get_vid_feat(frames_tensor, clip)

    text_feat_d = {}
    text_feat_d = get_text_feat_dict(texts, clip, tokenizer, text_feat_d)
    text_feats = [text_feat_d[t] for t in texts]
    text_feats_tensor = torch.cat(text_feats, 0)

    probs, idxs = clip.get_predict_label(vid_feat, text_feats_tensor, top=topk)

    ret_texts = [texts[i] for i in idxs.numpy()[0].tolist()]
    return ret_texts, probs.numpy()[0]
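

# --- Usage sketch (not part of the original module) --------------------------
# A minimal example of how these helpers fit together, assuming a local video
# file "example.mp4" (hypothetical path), a few made-up candidate captions, a
# CUDA device, and the default ViClip-InternVid-10M-FLT.pth checkpoint sitting
# next to this file. Adjust paths and texts for your own setup.
if __name__ == "__main__":
    video = cv2.VideoCapture("example.mp4")          # hypothetical input video
    frames = [f for f in _frame_from_video(video)]   # decode all frames (BGR)
    video.release()

    models = get_viclip(size='l')                    # loads ViCLIP + tokenizer
    candidates = [                                   # made-up candidate captions
        "a person is cooking in a kitchen",
        "a dog is running on the beach",
        "a car is driving down a highway",
    ]
    texts, probs = retrieve_text(frames, candidates, models=models, topk=3)
    for t, p in zip(texts, probs):
        print(f"{p:.4f}  {t}")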