{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a436c0a1-3410-4a7f-a186-9246075ac815",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModel\n",
    "\n",
    "# NOTE: trust_remote_code=True runs Python code downloaded from the model\n",
    "# repository; only enable it for repositories you trust.\n",
    "model = AutoModel.from_pretrained(\"OpenGVLab/ViCLIP-L-14-hf\", trust_remote_code=True)\n",
    "tokenizer = model.tokenizer\n",
    "# Bundle both objects under the keys that retrieve_text() looks up.\n",
    "model_tokenizer = {\"viclip\": model, \"tokenizer\": tokenizer}\n",
    "print(\"done\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a425a5da-ceaf-4b89-9845-c8ba576902d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# video data\n",
    "import numpy as np\n",
    "import os\n",
    "import cv2\n",
    "import torch\n",
    "\n",
    "\n",
    "def _frame_from_video(video):\n",
    "    \"\"\"Yield frames from a cv2.VideoCapture until it closes or a read fails.\"\"\"\n",
    "    while video.isOpened():\n",
    "        success, frame = video.read()\n",
    "        if success:\n",
    "            yield frame\n",
    "        else:\n",
    "            break\n",
    "\n",
    "\n",
    "VIDEO_PATH = 'example1.mp4'\n",
    "video = cv2.VideoCapture(VIDEO_PATH)\n",
    "try:\n",
    "    frames = [x for x in _frame_from_video(video)]\n",
    "finally:\n",
    "    # Free the capture handle even if decoding raises part-way through.\n",
    "    video.release()\n",
    "\n",
    "# Fail here with a clear message instead of on a bare assert further down.\n",
    "if not frames:\n",
    "    raise RuntimeError(f'no frames could be read from {VIDEO_PATH!r}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aac775ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# function\n",
    "\n",
    "def get_text_feat_dict(texts, clip, tokenizer, text_feat_d=None):\n",
    "    \"\"\"Map every text in texts to the result of clip.get_text_features.\n",
    "\n",
    "    text_feat_d is an optional dict that is passed on to\n",
    "    clip.get_text_features, filled in place and returned. When it is\n",
    "    omitted a new dict is created for this call, so no state is shared\n",
    "    between calls (a literal {} default would be one dict reused by all).\n",
    "    \"\"\"\n",
    "    if text_feat_d is None:\n",
    "        text_feat_d = {}\n",
    "    for t in texts:\n",
    "        feat = clip.get_text_features(t, tokenizer, text_feat_d)\n",
    "        text_feat_d[t] = feat\n",
    "    return text_feat_d\n",
    "\n",
    "\n",
    "def get_vid_feat(frames, clip):\n",
    "    \"\"\"Return clip.get_vid_features(frames).\"\"\"\n",
    "    return clip.get_vid_features(frames)\n",
    "\n",
    "\n",
    "# Per-channel mean / std (the usual ImageNet values), shaped (1, 1, 3) so\n",
    "# they broadcast over the trailing channel axis of a frame.\n",
    "v_mean = np.array([0.485, 0.456, 0.406]).reshape(1,1,3)\n",
    "v_std = np.array([0.229, 0.224, 0.225]).reshape(1,1,3)\n",
    "\n",
    "\n",
    "def normalize(data):\n",
    "    \"\"\"Scale data by 1/255, then subtract v_mean and divide by v_std.\"\"\"\n",
    "    return (data/255.0-v_mean)/v_std\n",
    "\n",
    "\n",
    "def frames2tensor(vid_list, fnum=8, target_size=(224, 224), device=torch.device('cuda')):\n",
    "    \"\"\"Pick fnum evenly spaced frames and pack them into one float32 tensor.\n",
    "\n",
    "    Each selected frame has its last axis reversed (cv2 decodes to BGR),\n",
    "    is resized to target_size and normalized. Assuming H x W x 3 input\n",
    "    frames, the result has shape (1, fnum, 3, H', W') and lives on device.\n",
    "    Raises AssertionError when fewer than fnum frames are supplied.\n",
    "    \"\"\"\n",
    "    assert len(vid_list) >= fnum, f'need at least {fnum} frames, got {len(vid_list)}'\n",
    "    step = len(vid_list) // fnum\n",
    "    vid_list = vid_list[::step][:fnum]\n",
    "    vid_list = [cv2.resize(x[:,:,::-1], target_size) for x in vid_list]\n",
    "    # Give every frame leading (batch, time) axes, then stack along time.\n",
    "    vid_tube = [np.expand_dims(normalize(x), axis=(0, 1)) for x in vid_list]\n",
    "    vid_tube = np.concatenate(vid_tube, axis=1)\n",
    "    # Move the channel axis in front of the two spatial axes.\n",
    "    vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3))\n",
    "    # normalize() yields float64; cast to float32 before the device copy so\n",
    "    # only half as many bytes are transferred (values are unchanged).\n",
    "    vid_tube = torch.from_numpy(vid_tube).float().to(device, non_blocking=True)\n",
    "    return vid_tube\n",
    "\n",
    "\n",
    "def retrieve_text(frames,\n",
    "                  texts,\n",
    "                  models={'viclip':None,\n",
    "                          'tokenizer':None},\n",
    "                  topk=5,\n",
    "                  device=torch.device('cuda')):\n",
    "    \"\"\"Rank the candidate texts against a video with the supplied model.\n",
    "\n",
    "    frames: list of decoded video frames (see frames2tensor).\n",
    "    texts: list of candidate strings.\n",
    "    models: dict with non-None 'viclip' and 'tokenizer' entries.\n",
    "    topk: number of results requested; capped at len(texts).\n",
    "    device: device the model and the frame tensor are moved to.\n",
    "\n",
    "    Returns (ret_texts, probs): the selected texts in the order given by\n",
    "    clip.get_predict_label, and the first row of its scores as a numpy array.\n",
    "    \"\"\"\n",
    "    assert (isinstance(models, dict)\n",
    "            and models['viclip'] is not None\n",
    "            and models['tokenizer'] is not None), \\\n",
    "        \"models must be a dict with non-None 'viclip' and 'tokenizer' entries\"\n",
    "    clip, tokenizer = models['viclip'], models['tokenizer']\n",
    "    clip = clip.to(device)\n",
    "    frames_tensor = frames2tensor(frames, device=device)\n",
    "    vid_feat = get_vid_feat(frames_tensor, clip)\n",
    "\n",
    "    # Use a dict that is local to this call for the text features.\n",
    "    text_feat_d = get_text_feat_dict(texts, clip, tokenizer, {})\n",
    "    text_feats = [text_feat_d[t] for t in texts]\n",
    "    text_feats_tensor = torch.cat(text_feats, 0)\n",
    "\n",
    "    # Never ask for more results than there are candidates.\n",
    "    top = min(topk, len(texts))\n",
    "    probs, idxs = clip.get_predict_label(vid_feat, text_feats_tensor, top=top)\n",
    "\n",
    "    # Tensor.numpy() needs a CPU tensor that does not require grad; whether\n",
    "    # get_predict_label already guarantees that is not visible from here.\n",
    "    probs = probs.detach().cpu()\n",
    "    idxs = idxs.detach().cpu()\n",
    "    ret_texts = [texts[i] for i in idxs.numpy()[0].tolist()]\n",
    "    return ret_texts, probs.numpy()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2969ba6-19d0-4893-b071-b82fa046c312",
   "metadata": {},
   "outputs": [],
   "source": [
    "# retrieval\n",
    "text_candidates = [\"A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.\",\n",
    "                   \"A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.\",\n",
    "                   \"A person dressed in a blue jacket shovels the snow-covered pavement outside their house.\",\n",
    "                   \"A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.\",\n",
    "                   \"A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride.\",\n",
    "                   \"A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees.\",\n",
    "                   \"A playful dog slides down a snowy hill, wagging its tail with delight.\",\n",
    "                   \"A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.\",\n",
    "                   \"A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.\",\n",
    "                   \"A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery.\"]\n",
    "texts, probs = retrieve_text(frames, text_candidates, models=model_tokenizer, topk=5)\n",
    "\n",
    "for t, p in zip(texts, probs):\n",
    "    print(f'text: {t} ~ prob: {p:.4f}')\n",
    "\n",
    "# text: A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run. ~ prob: 0.8333\n",
    "# text: A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon. ~ prob: 0.1266\n",
    "# text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.0368\n",
    "# text: A person dressed in a blue jacket shovels the snow-covered pavement outside their house. ~ prob: 0.0030\n",
    "# text: A playful dog slides down a snowy hill, wagging its tail with delight. ~ prob: 0.0003"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84922de7-b41c-41c1-87a0-b28e52da9b5d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}