{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a436c0a1-3410-4a7f-a186-9246075ac815",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModel\n",
    "model=AutoModel.from_pretrained(\"OpenGVLab/ViCLIP-L-14-hf\",trust_remote_code=True)\n",
    "tokenizer = model.tokenizer\n",
    "model_tokenizer={\"viclip\":model,\"tokenizer\":tokenizer}\n",
    "print(\"done\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a425a5da-ceaf-4b89-9845-c8ba576902d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# video data\n",
    "import numpy as np\n",
    "import os\n",
    "import cv2\n",
    "import torch\n",
    "def _frame_from_video(video):\n",
    "    while video.isOpened():\n",
    "        success, frame = video.read()\n",
    "        if success:\n",
    "            yield frame\n",
    "        else:\n",
    "            break\n",
    "video = cv2.VideoCapture('example1.mp4')\n",
    "frames = [x for x in _frame_from_video(video)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "aac775ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# function\n",
    "\n",
    "def get_text_feat_dict(texts, clip, tokenizer, text_feat_d={}):\n",
    "    for t in texts:\n",
    "        feat = clip.get_text_features(t, tokenizer, text_feat_d)\n",
    "        text_feat_d[t] = feat\n",
    "    return text_feat_d\n",
    "\n",
    "def get_vid_feat(frames, clip):\n",
    "    return clip.get_vid_features(frames)\n",
    "\n",
    "v_mean = np.array([0.485, 0.456, 0.406]).reshape(1,1,3)\n",
    "v_std = np.array([0.229, 0.224, 0.225]).reshape(1,1,3)\n",
    "def normalize(data):\n",
    "    return (data/255.0-v_mean)/v_std\n",
    "\n",
    "def frames2tensor(vid_list, fnum=8, target_size=(224, 224), device=torch.device('cuda')):\n",
    "    assert(len(vid_list) >= fnum)\n",
    "    step = len(vid_list) // fnum\n",
    "    vid_list = vid_list[::step][:fnum]\n",
    "    vid_list = [cv2.resize(x[:,:,::-1], target_size) for x in vid_list]\n",
    "    vid_tube = [np.expand_dims(normalize(x), axis=(0, 1)) for x in vid_list]\n",
    "    vid_tube = np.concatenate(vid_tube, axis=1)\n",
    "    vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3))\n",
    "    vid_tube = torch.from_numpy(vid_tube).to(device, non_blocking=True).float()\n",
    "    return vid_tube\n",
    "def retrieve_text(frames, \n",
    "                  texts, \n",
    "                  models={'viclip':None, \n",
    "                          'tokenizer':None},\n",
    "                  topk=5, \n",
    "                  device=torch.device('cuda')):\n",
    "    # clip, tokenizer = get_clip(name, model_cfg['size'], model_cfg['pretrained'], model_cfg['reload'])\n",
    "    assert(type(models)==dict and models['viclip'] is not None and models['tokenizer'] is not None)\n",
    "    clip, tokenizer = models['viclip'], models['tokenizer']\n",
    "    clip = clip.to(device)\n",
    "    frames_tensor = frames2tensor(frames, device=device)\n",
    "    vid_feat = get_vid_feat(frames_tensor, clip)\n",
    "\n",
    "    text_feat_d = {}\n",
    "    text_feat_d = get_text_feat_dict(texts, clip, tokenizer, text_feat_d)\n",
    "    text_feats = [text_feat_d[t] for t in texts]\n",
    "    text_feats_tensor = torch.cat(text_feats, 0)\n",
    "    \n",
    "    probs, idxs = clip.get_predict_label(vid_feat, text_feats_tensor, top=topk)\n",
    "\n",
    "    ret_texts = [texts[i] for i in idxs.numpy()[0].tolist()]\n",
    "    return ret_texts, probs.numpy()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2969ba6-19d0-4893-b071-b82fa046c312",
   "metadata": {},
   "outputs": [],
   "source": [
    "# retrieval\n",
    "text_candidates = [\"A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.\",\n",
    "                   \"A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.\",\n",
    "                   \"A person dressed in a blue jacket shovels the snow-covered pavement outside their house.\",\n",
    "                   \"A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.\",\n",
    "                   \"A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride.\",\n",
    "                   \"A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees.\",\n",
    "                   \"A playful dog slides down a snowy hill, wagging its tail with delight.\",\n",
    "                   \"A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.\",\n",
    "                   \"A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.\",\n",
    "                   \"A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery.\"]\n",
    "texts, probs = retrieve_text(frames, text_candidates, models=model_tokenizer, topk=5)\n",
    "\n",
    "for t, p in zip(texts, probs):\n",
    "    print(f'text: {t} ~ prob: {p:.4f}')\n",
    "    \n",
    "# text: A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run. ~ prob: 0.8333\n",
    "# text: A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon. ~ prob: 0.1266\n",
    "# text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.0368\n",
    "# text: A person dressed in a blue jacket shovels the snow-covered pavement outside their house. ~ prob: 0.0030\n",
    "# text: A playful dog slides down a snowy hill, wagging its tail with delight. ~ prob: 0.0003"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84922de7-b41c-41c1-87a0-b28e52da9b5d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}