{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f86bc499",
   "metadata": {},
   "source": [
    "## Download the ViCLIP weights and put the `.pth` file in the `viclip` folder.\n",
    "\n",
    "Then set `PRETRAINED_DIR` in the configuration cell below to the directory that contains the checkpoint."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "e7a90379-d9ee-45d9-9073-7ed5132fa6b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import os\n",
    "import cv2\n",
    "\n",
    "from viclip import get_viclip, retrieve_text, _frame_from_video"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "e6c1cd7a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory that holds the downloaded ViCLIP checkpoints: replace 'xxx' with your local path.\n",
    "PRETRAINED_DIR = 'xxx'\n",
    "# Video clip that the text candidates are ranked against.\n",
    "VIDEO_PATH = 'example1.mp4'\n",
    "\n",
    "# Model size and checkpoint file for each ViCLIP variant, keyed by model name.\n",
    "model_cfgs = {\n",
    "    'viclip-l-internvid-10m-flt': {\n",
    "        'size': 'l',\n",
    "        'pretrained': os.path.join(PRETRAINED_DIR, 'ViCLIP-L_InternVid-FLT-10M.pth'),\n",
    "    },\n",
    "    'viclip-l-internvid-200m': {\n",
    "        'size': 'l',\n",
    "        'pretrained': os.path.join(PRETRAINED_DIR, 'ViCLIP-L_InternVid-200M.pth'),\n",
    "    },\n",
    "    'viclip-b-internvid-10m-flt': {\n",
    "        'size': 'b',\n",
    "        'pretrained': os.path.join(PRETRAINED_DIR, 'ViCLIP-B_InternVid-FLT-10M.pth'),\n",
    "    },\n",
    "    'viclip-b-internvid-200m': {\n",
    "        'size': 'b',\n",
    "        'pretrained': os.path.join(PRETRAINED_DIR, 'ViCLIP-B_InternVid-200M.pth'),\n",
    "    },\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "load-video-md",
   "metadata": {},
   "source": [
    "## Load the video frames"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "a425a5da-ceaf-4b89-9845-c8ba576902d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "video = cv2.VideoCapture(VIDEO_PATH)\n",
    "try:\n",
    "    # cv2.VideoCapture does not raise on a bad path; check explicitly so the\n",
    "    # failure is reported here rather than in a later cell.\n",
    "    if not video.isOpened():\n",
    "        raise FileNotFoundError(f'Cannot open video: {VIDEO_PATH}')\n",
    "    frames = list(_frame_from_video(video))\n",
    "finally:\n",
    "    # The frames are held in a list now, so the capture handle can be freed.\n",
    "    video.release()\n",
    "\n",
    "assert frames, f'No frames could be decoded from {VIDEO_PATH}'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "candidates-md",
   "metadata": {},
   "source": [
    "## Candidate captions\n",
    "\n",
    "Each model below ranks these captions against the video and prints its five best matches."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "candidates",
   "metadata": {},
   "outputs": [],
   "source": [
    "text_candidates = [\"A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.\",\n",
    "                   \"A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.\",\n",
    "                   \"A person dressed in a blue jacket shovels the snow-covered pavement outside their house.\",\n",
    "                   \"A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.\",\n",
    "                   \"A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride.\",\n",
    "                   \"A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees.\",\n",
    "                   \"A playful dog slides down a snowy hill, wagging its tail with delight.\",\n",
    "                   \"A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.\",\n",
    "                   \"A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.\",\n",
    "                   \"A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery.\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "retrieve-helper",
   "metadata": {},
   "outputs": [],
   "source": [
    "def retrieve_and_report(cfg_name, topk=5):\n",
    "    \"\"\"Load one ViCLIP variant, rank `text_candidates` against `frames`, and print the top matches.\n",
    "\n",
    "    Args:\n",
    "        cfg_name: key of `model_cfgs` selecting the model size and checkpoint.\n",
    "        topk: number of best-matching captions to retrieve and print.\n",
    "\n",
    "    Returns:\n",
    "        (model, texts, probs): the object returned by `get_viclip`, followed by the\n",
    "        captions and scores returned by `retrieve_text`.\n",
    "    \"\"\"\n",
    "    cfg = model_cfgs[cfg_name]\n",
    "    model = get_viclip(cfg['size'], cfg['pretrained'])\n",
    "    texts, probs = retrieve_text(frames, text_candidates, models=model, topk=topk)\n",
    "\n",
    "    for t, p in zip(texts, probs):\n",
    "        print(f'text: {t} ~ prob: {p:.4f}')\n",
    "    return model, texts, probs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "viclip-l-md",
   "metadata": {},
   "source": [
    "## Retrieval with ViCLIP-L (InternVid-10M-FLT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3fb7397a-02ef-41b5-9ffe-f2363b277778",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "text: A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run. ~ prob: 0.8333\n",
      "text: A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon. ~ prob: 0.1266\n",
      "text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.0368\n",
      "text: A person dressed in a blue jacket shovels the snow-covered pavement outside their house. ~ prob: 0.0030\n",
      "text: A playful dog slides down a snowy hill, wagging its tail with delight. ~ prob: 0.0003\n"
     ]
    }
   ],
   "source": [
    "model_l, texts, probs = retrieve_and_report('viclip-l-internvid-10m-flt', topk=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "viclip-b-md",
   "metadata": {},
   "source": [
    "## Retrieval with ViCLIP-B (InternVid-10M-FLT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a2969ba6-19d0-4893-b071-b82fa046c312",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "text: A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon. ~ prob: 0.8192\n",
      "text: A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run. ~ prob: 0.1084\n",
      "text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.0676\n",
      "text: A playful dog slides down a snowy hill, wagging its tail with delight. ~ prob: 0.0047\n",
      "text: A person dressed in a blue jacket shovels the snow-covered pavement outside their house. ~ prob: 0.0002\n"
     ]
    }
   ],
   "source": [
    "model_b, texts, probs = retrieve_and_report('viclip-b-internvid-10m-flt', topk=5)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}