{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/toolkit/.conda/envs/urlb_test/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n", "DropoutAddRMSNorm of flash_attn is not installed!!!\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[2024-04-16 22:03:29,983] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n" ] } ], "source": [ "import numpy as np\n", "import os\n", "import io\n", "import cv2\n", "\n", "import torch\n", "import sys\n", "\n", "# Root of the InternVideo2 `multi_modality` checkout. Set the INTERNVIDEO2_ROOT\n", "# environment variable to point somewhere else; the default is the location\n", "# this notebook was originally written against.\n", "INTERNVIDEO2_ROOT = os.environ.get(\n", "    'INTERNVIDEO2_ROOT',\n", "    '/home/toolkit/eai_urlb/InternVideo/InternVideo2/multi_modality',\n", ")\n", "# Put the checkout and its demo/ folder at the front of sys.path so the\n", "# project-local imports below resolve.\n", "sys.path.insert(0, os.path.join(INTERNVIDEO2_ROOT, 'demo'))\n", "sys.path.insert(0, INTERNVIDEO2_ROOT)\n", "\n", "from small_config import (Config, eval_dict_leaf)\n", "from small_utils import (retrieve_text, _frame_from_video, setup_internvideo2)" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "# Video to process; swap in another file (e.g. 'example1.mp4') to try a different clip.\n", "VIDEO_PATH = '../../../../video_samples/person_walking_video.mp4'\n", "\n", "video = cv2.VideoCapture(VIDEO_PATH)\n", "# cv2.VideoCapture does not raise on an unreadable path, so check explicitly\n", "# rather than carrying on with an empty frame list.\n", "if not video.isOpened():\n", "    raise IOError(f'Could not open video: {VIDEO_PATH}')\n", "try:\n", "    # Collect every frame yielded by the helper into a list.\n", "    frames = list(_frame_from_video(video))\n", "finally:\n", "    # Free the capture handle once the frames have been read.\n", "    video.release()" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "text_candidates = [\"A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.\",\n", " \"A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.\",\n", " \"A person dressed in a blue jacket shovels the snow-covered pavement outside their house.\",\n", " \"A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.\",\n", " \"A person stands on the snowy floor, pushing a sled loaded with 
blankets, preparing for a fun-filled ride.\",\n", " \"A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees.\",\n", " \"A playful dog slides down a snowy hill, wagging its tail with delight.\",\n", " \"A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.\",\n", " \"A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.\",\n", " \"A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery.\",\n", " \"A person playing with a kid in the street\",\n", " \"A group of friends playing bowling.\",\n", " \"A japanese girl eating noodles\",\n", " \"A painting by Monet\",\n", " \"A person lying in bed\",\n", " \"A person lying down on the grass\",\n", " \"A person with a hat\",\n", " \"Playing with hat\",\n", " \"Somebody walking\",\n", " \"Fidget spinner\"]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "load_state_dict: _IncompatibleKeys(missing_keys=['text_encoder.embeddings.position_ids', 'text_encoder.embeddings.word_embeddings.weight', 'text_encoder.embeddings.position_embeddings.weight', 'text_encoder.embeddings.token_type_embeddings.weight', 'text_encoder.embeddings.LayerNorm.weight', 'text_encoder.embeddings.LayerNorm.bias', 'text_encoder.encoder.layer.0.attention.self.query.weight', 'text_encoder.encoder.layer.0.attention.self.query.bias', 'text_encoder.encoder.layer.0.attention.self.key.weight', 'text_encoder.encoder.layer.0.attention.self.key.bias', 'text_encoder.encoder.layer.0.attention.self.value.weight', 'text_encoder.encoder.layer.0.attention.self.value.bias', 'text_encoder.encoder.layer.0.attention.output.dense.weight', 'text_encoder.encoder.layer.0.attention.output.dense.bias', 'text_encoder.encoder.layer.0.attention.output.LayerNorm.weight', 
'text_encoder.encoder.layer.0.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.0.intermediate.dense.weight', 'text_encoder.encoder.layer.0.intermediate.dense.bias', 'text_encoder.encoder.layer.0.output.dense.weight', 'text_encoder.encoder.layer.0.output.dense.bias', 'text_encoder.encoder.layer.0.output.LayerNorm.weight', 'text_encoder.encoder.layer.0.output.LayerNorm.bias', 'text_encoder.encoder.layer.1.attention.self.query.weight', 'text_encoder.encoder.layer.1.attention.self.query.bias', 'text_encoder.encoder.layer.1.attention.self.key.weight', 'text_encoder.encoder.layer.1.attention.self.key.bias', 'text_encoder.encoder.layer.1.attention.self.value.weight', 'text_encoder.encoder.layer.1.attention.self.value.bias', 'text_encoder.encoder.layer.1.attention.output.dense.weight', 'text_encoder.encoder.layer.1.attention.output.dense.bias', 'text_encoder.encoder.layer.1.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.1.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.1.intermediate.dense.weight', 'text_encoder.encoder.layer.1.intermediate.dense.bias', 'text_encoder.encoder.layer.1.output.dense.weight', 'text_encoder.encoder.layer.1.output.dense.bias', 'text_encoder.encoder.layer.1.output.LayerNorm.weight', 'text_encoder.encoder.layer.1.output.LayerNorm.bias', 'text_encoder.encoder.layer.2.attention.self.query.weight', 'text_encoder.encoder.layer.2.attention.self.query.bias', 'text_encoder.encoder.layer.2.attention.self.key.weight', 'text_encoder.encoder.layer.2.attention.self.key.bias', 'text_encoder.encoder.layer.2.attention.self.value.weight', 'text_encoder.encoder.layer.2.attention.self.value.bias', 'text_encoder.encoder.layer.2.attention.output.dense.weight', 'text_encoder.encoder.layer.2.attention.output.dense.bias', 'text_encoder.encoder.layer.2.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.2.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.2.intermediate.dense.weight', 
'text_encoder.encoder.layer.2.intermediate.dense.bias', 'text_encoder.encoder.layer.2.output.dense.weight', 'text_encoder.encoder.layer.2.output.dense.bias', 'text_encoder.encoder.layer.2.output.LayerNorm.weight', 'text_encoder.encoder.layer.2.output.LayerNorm.bias', 'text_encoder.encoder.layer.3.attention.self.query.weight', 'text_encoder.encoder.layer.3.attention.self.query.bias', 'text_encoder.encoder.layer.3.attention.self.key.weight', 'text_encoder.encoder.layer.3.attention.self.key.bias', 'text_encoder.encoder.layer.3.attention.self.value.weight', 'text_encoder.encoder.layer.3.attention.self.value.bias', 'text_encoder.encoder.layer.3.attention.output.dense.weight', 'text_encoder.encoder.layer.3.attention.output.dense.bias', 'text_encoder.encoder.layer.3.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.3.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.3.intermediate.dense.weight', 'text_encoder.encoder.layer.3.intermediate.dense.bias', 'text_encoder.encoder.layer.3.output.dense.weight', 'text_encoder.encoder.layer.3.output.dense.bias', 'text_encoder.encoder.layer.3.output.LayerNorm.weight', 'text_encoder.encoder.layer.3.output.LayerNorm.bias', 'text_encoder.encoder.layer.4.attention.self.query.weight', 'text_encoder.encoder.layer.4.attention.self.query.bias', 'text_encoder.encoder.layer.4.attention.self.key.weight', 'text_encoder.encoder.layer.4.attention.self.key.bias', 'text_encoder.encoder.layer.4.attention.self.value.weight', 'text_encoder.encoder.layer.4.attention.self.value.bias', 'text_encoder.encoder.layer.4.attention.output.dense.weight', 'text_encoder.encoder.layer.4.attention.output.dense.bias', 'text_encoder.encoder.layer.4.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.4.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.4.intermediate.dense.weight', 'text_encoder.encoder.layer.4.intermediate.dense.bias', 'text_encoder.encoder.layer.4.output.dense.weight', 
'text_encoder.encoder.layer.4.output.dense.bias', 'text_encoder.encoder.layer.4.output.LayerNorm.weight', 'text_encoder.encoder.layer.4.output.LayerNorm.bias', 'text_encoder.encoder.layer.5.attention.self.query.weight', 'text_encoder.encoder.layer.5.attention.self.query.bias', 'text_encoder.encoder.layer.5.attention.self.key.weight', 'text_encoder.encoder.layer.5.attention.self.key.bias', 'text_encoder.encoder.layer.5.attention.self.value.weight', 'text_encoder.encoder.layer.5.attention.self.value.bias', 'text_encoder.encoder.layer.5.attention.output.dense.weight', 'text_encoder.encoder.layer.5.attention.output.dense.bias', 'text_encoder.encoder.layer.5.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.5.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.5.intermediate.dense.weight', 'text_encoder.encoder.layer.5.intermediate.dense.bias', 'text_encoder.encoder.layer.5.output.dense.weight', 'text_encoder.encoder.layer.5.output.dense.bias', 'text_encoder.encoder.layer.5.output.LayerNorm.weight', 'text_encoder.encoder.layer.5.output.LayerNorm.bias', 'text_encoder.encoder.layer.6.attention.self.query.weight', 'text_encoder.encoder.layer.6.attention.self.query.bias', 'text_encoder.encoder.layer.6.attention.self.key.weight', 'text_encoder.encoder.layer.6.attention.self.key.bias', 'text_encoder.encoder.layer.6.attention.self.value.weight', 'text_encoder.encoder.layer.6.attention.self.value.bias', 'text_encoder.encoder.layer.6.attention.output.dense.weight', 'text_encoder.encoder.layer.6.attention.output.dense.bias', 'text_encoder.encoder.layer.6.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.6.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.6.intermediate.dense.weight', 'text_encoder.encoder.layer.6.intermediate.dense.bias', 'text_encoder.encoder.layer.6.output.dense.weight', 'text_encoder.encoder.layer.6.output.dense.bias', 'text_encoder.encoder.layer.6.output.LayerNorm.weight', 
'text_encoder.encoder.layer.6.output.LayerNorm.bias', 'text_encoder.encoder.layer.7.attention.self.query.weight', 'text_encoder.encoder.layer.7.attention.self.query.bias', 'text_encoder.encoder.layer.7.attention.self.key.weight', 'text_encoder.encoder.layer.7.attention.self.key.bias', 'text_encoder.encoder.layer.7.attention.self.value.weight', 'text_encoder.encoder.layer.7.attention.self.value.bias', 'text_encoder.encoder.layer.7.attention.output.dense.weight', 'text_encoder.encoder.layer.7.attention.output.dense.bias', 'text_encoder.encoder.layer.7.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.7.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.7.intermediate.dense.weight', 'text_encoder.encoder.layer.7.intermediate.dense.bias', 'text_encoder.encoder.layer.7.output.dense.weight', 'text_encoder.encoder.layer.7.output.dense.bias', 'text_encoder.encoder.layer.7.output.LayerNorm.weight', 'text_encoder.encoder.layer.7.output.LayerNorm.bias', 'text_encoder.encoder.layer.8.attention.self.query.weight', 'text_encoder.encoder.layer.8.attention.self.query.bias', 'text_encoder.encoder.layer.8.attention.self.key.weight', 'text_encoder.encoder.layer.8.attention.self.key.bias', 'text_encoder.encoder.layer.8.attention.self.value.weight', 'text_encoder.encoder.layer.8.attention.self.value.bias', 'text_encoder.encoder.layer.8.attention.output.dense.weight', 'text_encoder.encoder.layer.8.attention.output.dense.bias', 'text_encoder.encoder.layer.8.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.8.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.8.intermediate.dense.weight', 'text_encoder.encoder.layer.8.intermediate.dense.bias', 'text_encoder.encoder.layer.8.output.dense.weight', 'text_encoder.encoder.layer.8.output.dense.bias', 'text_encoder.encoder.layer.8.output.LayerNorm.weight', 'text_encoder.encoder.layer.8.output.LayerNorm.bias', 'text_encoder.encoder.layer.9.attention.self.query.weight', 
'text_encoder.encoder.layer.9.attention.self.query.bias', 'text_encoder.encoder.layer.9.attention.self.key.weight', 'text_encoder.encoder.layer.9.attention.self.key.bias', 'text_encoder.encoder.layer.9.attention.self.value.weight', 'text_encoder.encoder.layer.9.attention.self.value.bias', 'text_encoder.encoder.layer.9.attention.output.dense.weight', 'text_encoder.encoder.layer.9.attention.output.dense.bias', 'text_encoder.encoder.layer.9.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.9.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.9.intermediate.dense.weight', 'text_encoder.encoder.layer.9.intermediate.dense.bias', 'text_encoder.encoder.layer.9.output.dense.weight', 'text_encoder.encoder.layer.9.output.dense.bias', 'text_encoder.encoder.layer.9.output.LayerNorm.weight', 'text_encoder.encoder.layer.9.output.LayerNorm.bias', 'text_encoder.encoder.layer.10.attention.self.query.weight', 'text_encoder.encoder.layer.10.attention.self.query.bias', 'text_encoder.encoder.layer.10.attention.self.key.weight', 'text_encoder.encoder.layer.10.attention.self.key.bias', 'text_encoder.encoder.layer.10.attention.self.value.weight', 'text_encoder.encoder.layer.10.attention.self.value.bias', 'text_encoder.encoder.layer.10.attention.output.dense.weight', 'text_encoder.encoder.layer.10.attention.output.dense.bias', 'text_encoder.encoder.layer.10.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.10.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.10.intermediate.dense.weight', 'text_encoder.encoder.layer.10.intermediate.dense.bias', 'text_encoder.encoder.layer.10.output.dense.weight', 'text_encoder.encoder.layer.10.output.dense.bias', 'text_encoder.encoder.layer.10.output.LayerNorm.weight', 'text_encoder.encoder.layer.10.output.LayerNorm.bias', 'text_encoder.encoder.layer.11.attention.self.query.weight', 'text_encoder.encoder.layer.11.attention.self.query.bias', 'text_encoder.encoder.layer.11.attention.self.key.weight', 
'text_encoder.encoder.layer.11.attention.self.key.bias', 'text_encoder.encoder.layer.11.attention.self.value.weight', 'text_encoder.encoder.layer.11.attention.self.value.bias', 'text_encoder.encoder.layer.11.attention.output.dense.weight', 'text_encoder.encoder.layer.11.attention.output.dense.bias', 'text_encoder.encoder.layer.11.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.11.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.11.intermediate.dense.weight', 'text_encoder.encoder.layer.11.intermediate.dense.bias', 'text_encoder.encoder.layer.11.output.dense.weight', 'text_encoder.encoder.layer.11.output.dense.bias', 'text_encoder.encoder.layer.11.output.LayerNorm.weight', 'text_encoder.encoder.layer.11.output.LayerNorm.bias', 'text_encoder.encoder.layer.12.attention.self.query.weight', 'text_encoder.encoder.layer.12.attention.self.query.bias', 'text_encoder.encoder.layer.12.attention.self.key.weight', 'text_encoder.encoder.layer.12.attention.self.key.bias', 'text_encoder.encoder.layer.12.attention.self.value.weight', 'text_encoder.encoder.layer.12.attention.self.value.bias', 'text_encoder.encoder.layer.12.attention.output.dense.weight', 'text_encoder.encoder.layer.12.attention.output.dense.bias', 'text_encoder.encoder.layer.12.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.12.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.12.intermediate.dense.weight', 'text_encoder.encoder.layer.12.intermediate.dense.bias', 'text_encoder.encoder.layer.12.output.dense.weight', 'text_encoder.encoder.layer.12.output.dense.bias', 'text_encoder.encoder.layer.12.output.LayerNorm.weight', 'text_encoder.encoder.layer.12.output.LayerNorm.bias', 'text_encoder.encoder.layer.13.attention.self.query.weight', 'text_encoder.encoder.layer.13.attention.self.query.bias', 'text_encoder.encoder.layer.13.attention.self.key.weight', 'text_encoder.encoder.layer.13.attention.self.key.bias', 
'text_encoder.encoder.layer.13.attention.self.value.weight', 'text_encoder.encoder.layer.13.attention.self.value.bias', 'text_encoder.encoder.layer.13.attention.output.dense.weight', 'text_encoder.encoder.layer.13.attention.output.dense.bias', 'text_encoder.encoder.layer.13.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.13.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.13.intermediate.dense.weight', 'text_encoder.encoder.layer.13.intermediate.dense.bias', 'text_encoder.encoder.layer.13.output.dense.weight', 'text_encoder.encoder.layer.13.output.dense.bias', 'text_encoder.encoder.layer.13.output.LayerNorm.weight', 'text_encoder.encoder.layer.13.output.LayerNorm.bias', 'text_encoder.encoder.layer.14.attention.self.query.weight', 'text_encoder.encoder.layer.14.attention.self.query.bias', 'text_encoder.encoder.layer.14.attention.self.key.weight', 'text_encoder.encoder.layer.14.attention.self.key.bias', 'text_encoder.encoder.layer.14.attention.self.value.weight', 'text_encoder.encoder.layer.14.attention.self.value.bias', 'text_encoder.encoder.layer.14.attention.output.dense.weight', 'text_encoder.encoder.layer.14.attention.output.dense.bias', 'text_encoder.encoder.layer.14.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.14.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.14.intermediate.dense.weight', 'text_encoder.encoder.layer.14.intermediate.dense.bias', 'text_encoder.encoder.layer.14.output.dense.weight', 'text_encoder.encoder.layer.14.output.dense.bias', 'text_encoder.encoder.layer.14.output.LayerNorm.weight', 'text_encoder.encoder.layer.14.output.LayerNorm.bias', 'text_encoder.encoder.layer.15.attention.self.query.weight', 'text_encoder.encoder.layer.15.attention.self.query.bias', 'text_encoder.encoder.layer.15.attention.self.key.weight', 'text_encoder.encoder.layer.15.attention.self.key.bias', 'text_encoder.encoder.layer.15.attention.self.value.weight', 
'text_encoder.encoder.layer.15.attention.self.value.bias', 'text_encoder.encoder.layer.15.attention.output.dense.weight', 'text_encoder.encoder.layer.15.attention.output.dense.bias', 'text_encoder.encoder.layer.15.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.15.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.15.intermediate.dense.weight', 'text_encoder.encoder.layer.15.intermediate.dense.bias', 'text_encoder.encoder.layer.15.output.dense.weight', 'text_encoder.encoder.layer.15.output.dense.bias', 'text_encoder.encoder.layer.15.output.LayerNorm.weight', 'text_encoder.encoder.layer.15.output.LayerNorm.bias', 'text_encoder.encoder.layer.16.attention.self.query.weight', 'text_encoder.encoder.layer.16.attention.self.query.bias', 'text_encoder.encoder.layer.16.attention.self.key.weight', 'text_encoder.encoder.layer.16.attention.self.key.bias', 'text_encoder.encoder.layer.16.attention.self.value.weight', 'text_encoder.encoder.layer.16.attention.self.value.bias', 'text_encoder.encoder.layer.16.attention.output.dense.weight', 'text_encoder.encoder.layer.16.attention.output.dense.bias', 'text_encoder.encoder.layer.16.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.16.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.16.intermediate.dense.weight', 'text_encoder.encoder.layer.16.intermediate.dense.bias', 'text_encoder.encoder.layer.16.output.dense.weight', 'text_encoder.encoder.layer.16.output.dense.bias', 'text_encoder.encoder.layer.16.output.LayerNorm.weight', 'text_encoder.encoder.layer.16.output.LayerNorm.bias', 'text_encoder.encoder.layer.17.attention.self.query.weight', 'text_encoder.encoder.layer.17.attention.self.query.bias', 'text_encoder.encoder.layer.17.attention.self.key.weight', 'text_encoder.encoder.layer.17.attention.self.key.bias', 'text_encoder.encoder.layer.17.attention.self.value.weight', 'text_encoder.encoder.layer.17.attention.self.value.bias', 
'text_encoder.encoder.layer.17.attention.output.dense.weight', 'text_encoder.encoder.layer.17.attention.output.dense.bias', 'text_encoder.encoder.layer.17.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.17.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.17.intermediate.dense.weight', 'text_encoder.encoder.layer.17.intermediate.dense.bias', 'text_encoder.encoder.layer.17.output.dense.weight', 'text_encoder.encoder.layer.17.output.dense.bias', 'text_encoder.encoder.layer.17.output.LayerNorm.weight', 'text_encoder.encoder.layer.17.output.LayerNorm.bias', 'text_encoder.encoder.layer.18.attention.self.query.weight', 'text_encoder.encoder.layer.18.attention.self.query.bias', 'text_encoder.encoder.layer.18.attention.self.key.weight', 'text_encoder.encoder.layer.18.attention.self.key.bias', 'text_encoder.encoder.layer.18.attention.self.value.weight', 'text_encoder.encoder.layer.18.attention.self.value.bias', 'text_encoder.encoder.layer.18.attention.output.dense.weight', 'text_encoder.encoder.layer.18.attention.output.dense.bias', 'text_encoder.encoder.layer.18.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.18.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.18.intermediate.dense.weight', 'text_encoder.encoder.layer.18.intermediate.dense.bias', 'text_encoder.encoder.layer.18.output.dense.weight', 'text_encoder.encoder.layer.18.output.dense.bias', 'text_encoder.encoder.layer.18.output.LayerNorm.weight', 'text_encoder.encoder.layer.18.output.LayerNorm.bias', 'text_encoder.encoder.layer.19.attention.self.query.weight', 'text_encoder.encoder.layer.19.attention.self.query.bias', 'text_encoder.encoder.layer.19.attention.self.key.weight', 'text_encoder.encoder.layer.19.attention.self.key.bias', 'text_encoder.encoder.layer.19.attention.self.value.weight', 'text_encoder.encoder.layer.19.attention.self.value.bias', 'text_encoder.encoder.layer.19.attention.output.dense.weight', 
'text_encoder.encoder.layer.19.attention.output.dense.bias', 'text_encoder.encoder.layer.19.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.19.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.19.crossattention.self.query.weight', 'text_encoder.encoder.layer.19.crossattention.self.query.bias', 'text_encoder.encoder.layer.19.crossattention.self.key.weight', 'text_encoder.encoder.layer.19.crossattention.self.key.bias', 'text_encoder.encoder.layer.19.crossattention.self.value.weight', 'text_encoder.encoder.layer.19.crossattention.self.value.bias', 'text_encoder.encoder.layer.19.crossattention.output.dense.weight', 'text_encoder.encoder.layer.19.crossattention.output.dense.bias', 'text_encoder.encoder.layer.19.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.19.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.19.intermediate.dense.weight', 'text_encoder.encoder.layer.19.intermediate.dense.bias', 'text_encoder.encoder.layer.19.output.dense.weight', 'text_encoder.encoder.layer.19.output.dense.bias', 'text_encoder.encoder.layer.19.output.LayerNorm.weight', 'text_encoder.encoder.layer.19.output.LayerNorm.bias', 'text_encoder.encoder.layer.20.attention.self.query.weight', 'text_encoder.encoder.layer.20.attention.self.query.bias', 'text_encoder.encoder.layer.20.attention.self.key.weight', 'text_encoder.encoder.layer.20.attention.self.key.bias', 'text_encoder.encoder.layer.20.attention.self.value.weight', 'text_encoder.encoder.layer.20.attention.self.value.bias', 'text_encoder.encoder.layer.20.attention.output.dense.weight', 'text_encoder.encoder.layer.20.attention.output.dense.bias', 'text_encoder.encoder.layer.20.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.20.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.20.crossattention.self.query.weight', 'text_encoder.encoder.layer.20.crossattention.self.query.bias', 'text_encoder.encoder.layer.20.crossattention.self.key.weight', 
'text_encoder.encoder.layer.20.crossattention.self.key.bias', 'text_encoder.encoder.layer.20.crossattention.self.value.weight', 'text_encoder.encoder.layer.20.crossattention.self.value.bias', 'text_encoder.encoder.layer.20.crossattention.output.dense.weight', 'text_encoder.encoder.layer.20.crossattention.output.dense.bias', 'text_encoder.encoder.layer.20.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.20.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.20.intermediate.dense.weight', 'text_encoder.encoder.layer.20.intermediate.dense.bias', 'text_encoder.encoder.layer.20.output.dense.weight', 'text_encoder.encoder.layer.20.output.dense.bias', 'text_encoder.encoder.layer.20.output.LayerNorm.weight', 'text_encoder.encoder.layer.20.output.LayerNorm.bias', 'text_encoder.encoder.layer.21.attention.self.query.weight', 'text_encoder.encoder.layer.21.attention.self.query.bias', 'text_encoder.encoder.layer.21.attention.self.key.weight', 'text_encoder.encoder.layer.21.attention.self.key.bias', 'text_encoder.encoder.layer.21.attention.self.value.weight', 'text_encoder.encoder.layer.21.attention.self.value.bias', 'text_encoder.encoder.layer.21.attention.output.dense.weight', 'text_encoder.encoder.layer.21.attention.output.dense.bias', 'text_encoder.encoder.layer.21.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.21.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.21.crossattention.self.query.weight', 'text_encoder.encoder.layer.21.crossattention.self.query.bias', 'text_encoder.encoder.layer.21.crossattention.self.key.weight', 'text_encoder.encoder.layer.21.crossattention.self.key.bias', 'text_encoder.encoder.layer.21.crossattention.self.value.weight', 'text_encoder.encoder.layer.21.crossattention.self.value.bias', 'text_encoder.encoder.layer.21.crossattention.output.dense.weight', 'text_encoder.encoder.layer.21.crossattention.output.dense.bias', 
'text_encoder.encoder.layer.21.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.21.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.21.intermediate.dense.weight', 'text_encoder.encoder.layer.21.intermediate.dense.bias', 'text_encoder.encoder.layer.21.output.dense.weight', 'text_encoder.encoder.layer.21.output.dense.bias', 'text_encoder.encoder.layer.21.output.LayerNorm.weight', 'text_encoder.encoder.layer.21.output.LayerNorm.bias', 'text_encoder.encoder.layer.22.attention.self.query.weight', 'text_encoder.encoder.layer.22.attention.self.query.bias', 'text_encoder.encoder.layer.22.attention.self.key.weight', 'text_encoder.encoder.layer.22.attention.self.key.bias', 'text_encoder.encoder.layer.22.attention.self.value.weight', 'text_encoder.encoder.layer.22.attention.self.value.bias', 'text_encoder.encoder.layer.22.attention.output.dense.weight', 'text_encoder.encoder.layer.22.attention.output.dense.bias', 'text_encoder.encoder.layer.22.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.22.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.22.crossattention.self.query.weight', 'text_encoder.encoder.layer.22.crossattention.self.query.bias', 'text_encoder.encoder.layer.22.crossattention.self.key.weight', 'text_encoder.encoder.layer.22.crossattention.self.key.bias', 'text_encoder.encoder.layer.22.crossattention.self.value.weight', 'text_encoder.encoder.layer.22.crossattention.self.value.bias', 'text_encoder.encoder.layer.22.crossattention.output.dense.weight', 'text_encoder.encoder.layer.22.crossattention.output.dense.bias', 'text_encoder.encoder.layer.22.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.22.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.22.intermediate.dense.weight', 'text_encoder.encoder.layer.22.intermediate.dense.bias', 'text_encoder.encoder.layer.22.output.dense.weight', 'text_encoder.encoder.layer.22.output.dense.bias', 
'text_encoder.encoder.layer.22.output.LayerNorm.weight', 'text_encoder.encoder.layer.22.output.LayerNorm.bias', 'text_encoder.encoder.layer.23.attention.self.query.weight', 'text_encoder.encoder.layer.23.attention.self.query.bias', 'text_encoder.encoder.layer.23.attention.self.key.weight', 'text_encoder.encoder.layer.23.attention.self.key.bias', 'text_encoder.encoder.layer.23.attention.self.value.weight', 'text_encoder.encoder.layer.23.attention.self.value.bias', 'text_encoder.encoder.layer.23.attention.output.dense.weight', 'text_encoder.encoder.layer.23.attention.output.dense.bias', 'text_encoder.encoder.layer.23.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.23.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.23.crossattention.self.query.weight', 'text_encoder.encoder.layer.23.crossattention.self.query.bias', 'text_encoder.encoder.layer.23.crossattention.self.key.weight', 'text_encoder.encoder.layer.23.crossattention.self.key.bias', 'text_encoder.encoder.layer.23.crossattention.self.value.weight', 'text_encoder.encoder.layer.23.crossattention.self.value.bias', 'text_encoder.encoder.layer.23.crossattention.output.dense.weight', 'text_encoder.encoder.layer.23.crossattention.output.dense.bias', 'text_encoder.encoder.layer.23.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.23.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.23.intermediate.dense.weight', 'text_encoder.encoder.layer.23.intermediate.dense.bias', 'text_encoder.encoder.layer.23.output.dense.weight', 'text_encoder.encoder.layer.23.output.dense.bias', 'text_encoder.encoder.layer.23.output.LayerNorm.weight', 'text_encoder.encoder.layer.23.output.LayerNorm.bias'], unexpected_keys=['temp', 'itm_head.weight', 'itm_head.bias', 'text_encoder.bert.embeddings.position_ids', 'text_encoder.bert.embeddings.word_embeddings.weight', 'text_encoder.bert.embeddings.position_embeddings.weight', 'text_encoder.bert.embeddings.token_type_embeddings.weight', 
'text_encoder.bert.embeddings.LayerNorm.weight', 'text_encoder.bert.embeddings.LayerNorm.bias', 'text_encoder.bert.encoder.layer.0.attention.self.query.weight', 'text_encoder.bert.encoder.layer.0.attention.self.query.bias', 'text_encoder.bert.encoder.layer.0.attention.self.key.weight', 'text_encoder.bert.encoder.layer.0.attention.self.key.bias', 'text_encoder.bert.encoder.layer.0.attention.self.value.weight', 'text_encoder.bert.encoder.layer.0.attention.self.value.bias', 'text_encoder.bert.encoder.layer.0.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.0.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.0.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.0.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.0.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.0.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.0.output.dense.weight', 'text_encoder.bert.encoder.layer.0.output.dense.bias', 'text_encoder.bert.encoder.layer.0.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.0.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.1.attention.self.query.weight', 'text_encoder.bert.encoder.layer.1.attention.self.query.bias', 'text_encoder.bert.encoder.layer.1.attention.self.key.weight', 'text_encoder.bert.encoder.layer.1.attention.self.key.bias', 'text_encoder.bert.encoder.layer.1.attention.self.value.weight', 'text_encoder.bert.encoder.layer.1.attention.self.value.bias', 'text_encoder.bert.encoder.layer.1.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.1.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.1.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.1.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.1.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.1.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.1.output.dense.weight', 'text_encoder.bert.encoder.layer.1.output.dense.bias', 
'text_encoder.bert.encoder.layer.1.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.1.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.2.attention.self.query.weight', 'text_encoder.bert.encoder.layer.2.attention.self.query.bias', 'text_encoder.bert.encoder.layer.2.attention.self.key.weight', 'text_encoder.bert.encoder.layer.2.attention.self.key.bias', 'text_encoder.bert.encoder.layer.2.attention.self.value.weight', 'text_encoder.bert.encoder.layer.2.attention.self.value.bias', 'text_encoder.bert.encoder.layer.2.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.2.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.2.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.2.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.2.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.2.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.2.output.dense.weight', 'text_encoder.bert.encoder.layer.2.output.dense.bias', 'text_encoder.bert.encoder.layer.2.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.2.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.3.attention.self.query.weight', 'text_encoder.bert.encoder.layer.3.attention.self.query.bias', 'text_encoder.bert.encoder.layer.3.attention.self.key.weight', 'text_encoder.bert.encoder.layer.3.attention.self.key.bias', 'text_encoder.bert.encoder.layer.3.attention.self.value.weight', 'text_encoder.bert.encoder.layer.3.attention.self.value.bias', 'text_encoder.bert.encoder.layer.3.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.3.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.3.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.3.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.3.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.3.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.3.output.dense.weight', 
'text_encoder.bert.encoder.layer.3.output.dense.bias', 'text_encoder.bert.encoder.layer.3.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.3.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.4.attention.self.query.weight', 'text_encoder.bert.encoder.layer.4.attention.self.query.bias', 'text_encoder.bert.encoder.layer.4.attention.self.key.weight', 'text_encoder.bert.encoder.layer.4.attention.self.key.bias', 'text_encoder.bert.encoder.layer.4.attention.self.value.weight', 'text_encoder.bert.encoder.layer.4.attention.self.value.bias', 'text_encoder.bert.encoder.layer.4.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.4.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.4.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.4.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.4.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.4.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.4.output.dense.weight', 'text_encoder.bert.encoder.layer.4.output.dense.bias', 'text_encoder.bert.encoder.layer.4.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.4.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.5.attention.self.query.weight', 'text_encoder.bert.encoder.layer.5.attention.self.query.bias', 'text_encoder.bert.encoder.layer.5.attention.self.key.weight', 'text_encoder.bert.encoder.layer.5.attention.self.key.bias', 'text_encoder.bert.encoder.layer.5.attention.self.value.weight', 'text_encoder.bert.encoder.layer.5.attention.self.value.bias', 'text_encoder.bert.encoder.layer.5.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.5.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.5.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.5.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.5.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.5.intermediate.dense.bias', 
'text_encoder.bert.encoder.layer.5.output.dense.weight', 'text_encoder.bert.encoder.layer.5.output.dense.bias', 'text_encoder.bert.encoder.layer.5.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.5.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.6.attention.self.query.weight', 'text_encoder.bert.encoder.layer.6.attention.self.query.bias', 'text_encoder.bert.encoder.layer.6.attention.self.key.weight', 'text_encoder.bert.encoder.layer.6.attention.self.key.bias', 'text_encoder.bert.encoder.layer.6.attention.self.value.weight', 'text_encoder.bert.encoder.layer.6.attention.self.value.bias', 'text_encoder.bert.encoder.layer.6.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.6.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.6.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.6.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.6.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.6.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.6.output.dense.weight', 'text_encoder.bert.encoder.layer.6.output.dense.bias', 'text_encoder.bert.encoder.layer.6.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.6.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.7.attention.self.query.weight', 'text_encoder.bert.encoder.layer.7.attention.self.query.bias', 'text_encoder.bert.encoder.layer.7.attention.self.key.weight', 'text_encoder.bert.encoder.layer.7.attention.self.key.bias', 'text_encoder.bert.encoder.layer.7.attention.self.value.weight', 'text_encoder.bert.encoder.layer.7.attention.self.value.bias', 'text_encoder.bert.encoder.layer.7.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.7.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.7.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.7.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.7.intermediate.dense.weight', 
'text_encoder.bert.encoder.layer.7.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.7.output.dense.weight', 'text_encoder.bert.encoder.layer.7.output.dense.bias', 'text_encoder.bert.encoder.layer.7.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.7.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.8.attention.self.query.weight', 'text_encoder.bert.encoder.layer.8.attention.self.query.bias', 'text_encoder.bert.encoder.layer.8.attention.self.key.weight', 'text_encoder.bert.encoder.layer.8.attention.self.key.bias', 'text_encoder.bert.encoder.layer.8.attention.self.value.weight', 'text_encoder.bert.encoder.layer.8.attention.self.value.bias', 'text_encoder.bert.encoder.layer.8.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.8.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.8.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.8.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.8.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.8.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.8.output.dense.weight', 'text_encoder.bert.encoder.layer.8.output.dense.bias', 'text_encoder.bert.encoder.layer.8.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.8.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.9.attention.self.query.weight', 'text_encoder.bert.encoder.layer.9.attention.self.query.bias', 'text_encoder.bert.encoder.layer.9.attention.self.key.weight', 'text_encoder.bert.encoder.layer.9.attention.self.key.bias', 'text_encoder.bert.encoder.layer.9.attention.self.value.weight', 'text_encoder.bert.encoder.layer.9.attention.self.value.bias', 'text_encoder.bert.encoder.layer.9.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.9.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.9.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.9.attention.output.LayerNorm.bias', 
'text_encoder.bert.encoder.layer.9.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.9.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.9.output.dense.weight', 'text_encoder.bert.encoder.layer.9.output.dense.bias', 'text_encoder.bert.encoder.layer.9.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.9.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.10.attention.self.query.weight', 'text_encoder.bert.encoder.layer.10.attention.self.query.bias', 'text_encoder.bert.encoder.layer.10.attention.self.key.weight', 'text_encoder.bert.encoder.layer.10.attention.self.key.bias', 'text_encoder.bert.encoder.layer.10.attention.self.value.weight', 'text_encoder.bert.encoder.layer.10.attention.self.value.bias', 'text_encoder.bert.encoder.layer.10.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.10.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.10.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.10.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.10.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.10.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.10.output.dense.weight', 'text_encoder.bert.encoder.layer.10.output.dense.bias', 'text_encoder.bert.encoder.layer.10.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.10.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.11.attention.self.query.weight', 'text_encoder.bert.encoder.layer.11.attention.self.query.bias', 'text_encoder.bert.encoder.layer.11.attention.self.key.weight', 'text_encoder.bert.encoder.layer.11.attention.self.key.bias', 'text_encoder.bert.encoder.layer.11.attention.self.value.weight', 'text_encoder.bert.encoder.layer.11.attention.self.value.bias', 'text_encoder.bert.encoder.layer.11.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.11.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.11.attention.output.LayerNorm.weight', 
'text_encoder.bert.encoder.layer.11.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.11.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.11.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.11.output.dense.weight', 'text_encoder.bert.encoder.layer.11.output.dense.bias', 'text_encoder.bert.encoder.layer.11.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.11.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.12.attention.self.query.weight', 'text_encoder.bert.encoder.layer.12.attention.self.query.bias', 'text_encoder.bert.encoder.layer.12.attention.self.key.weight', 'text_encoder.bert.encoder.layer.12.attention.self.key.bias', 'text_encoder.bert.encoder.layer.12.attention.self.value.weight', 'text_encoder.bert.encoder.layer.12.attention.self.value.bias', 'text_encoder.bert.encoder.layer.12.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.12.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.12.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.12.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.12.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.12.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.12.output.dense.weight', 'text_encoder.bert.encoder.layer.12.output.dense.bias', 'text_encoder.bert.encoder.layer.12.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.12.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.13.attention.self.query.weight', 'text_encoder.bert.encoder.layer.13.attention.self.query.bias', 'text_encoder.bert.encoder.layer.13.attention.self.key.weight', 'text_encoder.bert.encoder.layer.13.attention.self.key.bias', 'text_encoder.bert.encoder.layer.13.attention.self.value.weight', 'text_encoder.bert.encoder.layer.13.attention.self.value.bias', 'text_encoder.bert.encoder.layer.13.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.13.attention.output.dense.bias', 
'text_encoder.bert.encoder.layer.13.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.13.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.13.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.13.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.13.output.dense.weight', 'text_encoder.bert.encoder.layer.13.output.dense.bias', 'text_encoder.bert.encoder.layer.13.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.13.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.14.attention.self.query.weight', 'text_encoder.bert.encoder.layer.14.attention.self.query.bias', 'text_encoder.bert.encoder.layer.14.attention.self.key.weight', 'text_encoder.bert.encoder.layer.14.attention.self.key.bias', 'text_encoder.bert.encoder.layer.14.attention.self.value.weight', 'text_encoder.bert.encoder.layer.14.attention.self.value.bias', 'text_encoder.bert.encoder.layer.14.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.14.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.14.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.14.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.14.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.14.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.14.output.dense.weight', 'text_encoder.bert.encoder.layer.14.output.dense.bias', 'text_encoder.bert.encoder.layer.14.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.14.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.15.attention.self.query.weight', 'text_encoder.bert.encoder.layer.15.attention.self.query.bias', 'text_encoder.bert.encoder.layer.15.attention.self.key.weight', 'text_encoder.bert.encoder.layer.15.attention.self.key.bias', 'text_encoder.bert.encoder.layer.15.attention.self.value.weight', 'text_encoder.bert.encoder.layer.15.attention.self.value.bias', 'text_encoder.bert.encoder.layer.15.attention.output.dense.weight', 
'text_encoder.bert.encoder.layer.15.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.15.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.15.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.15.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.15.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.15.output.dense.weight', 'text_encoder.bert.encoder.layer.15.output.dense.bias', 'text_encoder.bert.encoder.layer.15.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.15.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.16.attention.self.query.weight', 'text_encoder.bert.encoder.layer.16.attention.self.query.bias', 'text_encoder.bert.encoder.layer.16.attention.self.key.weight', 'text_encoder.bert.encoder.layer.16.attention.self.key.bias', 'text_encoder.bert.encoder.layer.16.attention.self.value.weight', 'text_encoder.bert.encoder.layer.16.attention.self.value.bias', 'text_encoder.bert.encoder.layer.16.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.16.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.16.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.16.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.16.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.16.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.16.output.dense.weight', 'text_encoder.bert.encoder.layer.16.output.dense.bias', 'text_encoder.bert.encoder.layer.16.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.16.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.17.attention.self.query.weight', 'text_encoder.bert.encoder.layer.17.attention.self.query.bias', 'text_encoder.bert.encoder.layer.17.attention.self.key.weight', 'text_encoder.bert.encoder.layer.17.attention.self.key.bias', 'text_encoder.bert.encoder.layer.17.attention.self.value.weight', 'text_encoder.bert.encoder.layer.17.attention.self.value.bias', 
'text_encoder.bert.encoder.layer.17.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.17.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.17.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.17.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.17.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.17.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.17.output.dense.weight', 'text_encoder.bert.encoder.layer.17.output.dense.bias', 'text_encoder.bert.encoder.layer.17.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.17.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.18.attention.self.query.weight', 'text_encoder.bert.encoder.layer.18.attention.self.query.bias', 'text_encoder.bert.encoder.layer.18.attention.self.key.weight', 'text_encoder.bert.encoder.layer.18.attention.self.key.bias', 'text_encoder.bert.encoder.layer.18.attention.self.value.weight', 'text_encoder.bert.encoder.layer.18.attention.self.value.bias', 'text_encoder.bert.encoder.layer.18.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.18.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.18.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.18.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.18.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.18.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.18.output.dense.weight', 'text_encoder.bert.encoder.layer.18.output.dense.bias', 'text_encoder.bert.encoder.layer.18.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.18.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.19.attention.self.query.weight', 'text_encoder.bert.encoder.layer.19.attention.self.query.bias', 'text_encoder.bert.encoder.layer.19.attention.self.key.weight', 'text_encoder.bert.encoder.layer.19.attention.self.key.bias', 'text_encoder.bert.encoder.layer.19.attention.self.value.weight', 
'text_encoder.bert.encoder.layer.19.attention.self.value.bias', 'text_encoder.bert.encoder.layer.19.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.19.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.19.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.19.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.19.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.19.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.19.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.19.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.19.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.19.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.19.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.19.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.19.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.19.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.19.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.19.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.19.output.dense.weight', 'text_encoder.bert.encoder.layer.19.output.dense.bias', 'text_encoder.bert.encoder.layer.19.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.19.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.20.attention.self.query.weight', 'text_encoder.bert.encoder.layer.20.attention.self.query.bias', 'text_encoder.bert.encoder.layer.20.attention.self.key.weight', 'text_encoder.bert.encoder.layer.20.attention.self.key.bias', 'text_encoder.bert.encoder.layer.20.attention.self.value.weight', 'text_encoder.bert.encoder.layer.20.attention.self.value.bias', 'text_encoder.bert.encoder.layer.20.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.20.attention.output.dense.bias', 
'text_encoder.bert.encoder.layer.20.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.20.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.20.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.20.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.20.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.20.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.20.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.20.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.20.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.20.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.20.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.20.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.20.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.20.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.20.output.dense.weight', 'text_encoder.bert.encoder.layer.20.output.dense.bias', 'text_encoder.bert.encoder.layer.20.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.20.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.21.attention.self.query.weight', 'text_encoder.bert.encoder.layer.21.attention.self.query.bias', 'text_encoder.bert.encoder.layer.21.attention.self.key.weight', 'text_encoder.bert.encoder.layer.21.attention.self.key.bias', 'text_encoder.bert.encoder.layer.21.attention.self.value.weight', 'text_encoder.bert.encoder.layer.21.attention.self.value.bias', 'text_encoder.bert.encoder.layer.21.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.21.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.21.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.21.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.21.crossattention.self.query.weight', 
'text_encoder.bert.encoder.layer.21.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.21.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.21.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.21.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.21.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.21.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.21.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.21.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.21.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.21.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.21.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.21.output.dense.weight', 'text_encoder.bert.encoder.layer.21.output.dense.bias', 'text_encoder.bert.encoder.layer.21.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.21.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.22.attention.self.query.weight', 'text_encoder.bert.encoder.layer.22.attention.self.query.bias', 'text_encoder.bert.encoder.layer.22.attention.self.key.weight', 'text_encoder.bert.encoder.layer.22.attention.self.key.bias', 'text_encoder.bert.encoder.layer.22.attention.self.value.weight', 'text_encoder.bert.encoder.layer.22.attention.self.value.bias', 'text_encoder.bert.encoder.layer.22.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.22.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.22.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.22.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.22.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.22.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.22.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.22.crossattention.self.key.bias', 
'text_encoder.bert.encoder.layer.22.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.22.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.22.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.22.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.22.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.22.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.22.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.22.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.22.output.dense.weight', 'text_encoder.bert.encoder.layer.22.output.dense.bias', 'text_encoder.bert.encoder.layer.22.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.22.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.23.attention.self.query.weight', 'text_encoder.bert.encoder.layer.23.attention.self.query.bias', 'text_encoder.bert.encoder.layer.23.attention.self.key.weight', 'text_encoder.bert.encoder.layer.23.attention.self.key.bias', 'text_encoder.bert.encoder.layer.23.attention.self.value.weight', 'text_encoder.bert.encoder.layer.23.attention.self.value.bias', 'text_encoder.bert.encoder.layer.23.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.23.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.23.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.23.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.23.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.23.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.23.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.23.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.23.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.23.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.23.crossattention.output.dense.weight', 
'text_encoder.bert.encoder.layer.23.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.23.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.23.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.23.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.23.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.23.output.dense.weight', 'text_encoder.bert.encoder.layer.23.output.dense.bias', 'text_encoder.bert.encoder.layer.23.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.23.output.LayerNorm.bias', 'text_encoder.cls.predictions.bias', 'text_encoder.cls.predictions.transform.dense.weight', 'text_encoder.cls.predictions.transform.dense.bias', 'text_encoder.cls.predictions.transform.LayerNorm.weight', 'text_encoder.cls.predictions.transform.LayerNorm.bias', 'text_encoder.cls.predictions.decoder.weight', 'text_encoder.cls.predictions.decoder.bias'])\n" ] } ], "source": [
 "# Load the InternVideo2 stage-2 model and its tokenizer.\n",
 "# Defines `config`, `model_pth`, `intern_model` and `tokenizer` for the cells below.\n",
 "import gc\n",
 "\n",
 "# Root of the InternVideo2 checkout; set the INTERNVIDEO2_ROOT environment variable to relocate it.\n",
 "INTERNVIDEO2_ROOT = os.environ.get('INTERNVIDEO2_ROOT', '/home/toolkit/eai_urlb/InternVideo/InternVideo2')\n",
 "MULTI_MODALITY_DIR = INTERNVIDEO2_ROOT + '/multi_modality/'\n",
 "BERT_LARGE_DIR = '/home/toolkit/.cache/huggingface/hub/models--bert-large-uncased/snapshots/6da4b6a26a1877e173fca3225479512db81a5e5b/'\n",
 "NUM_FRAMES = 8  # written to the vision encoder and to the top-level train/test frame counts\n",
 "\n",
 "# Drop whatever a previous run of this cell left behind before loading again.\n",
 "# pop() tolerates either name being absent; the earlier `del` pair raised NameError\n",
 "# when `intern_model` existed but `tokenizer` did not.\n",
 "for stale_name in ('intern_model', 'tokenizer'):\n",
 "    globals().pop(stale_name, None)\n",
 "gc.collect()\n",
 "torch.cuda.empty_cache()  # release unoccupied cached GPU memory before the next load\n",
 "\n",
 "config = Config.from_file(MULTI_MODALITY_DIR + 'demo/internvideo2_stage2_config.py')\n",
 "config = eval_dict_leaf(config)\n",
 "config.model.vision_encoder.num_frames = NUM_FRAMES\n",
 "config.num_frames = NUM_FRAMES\n",
 "config.num_frames_test = NUM_FRAMES\n",
 "config.model.text_encoder.pretrained = BERT_LARGE_DIR\n",
 "# Prefix the text-encoder config path stored in the loaded config with the multi_modality directory.\n",
 "config.model.text_encoder.config = MULTI_MODALITY_DIR + config.model.text_encoder.config\n",
 "model_pth = INTERNVIDEO2_ROOT + '/download_models/InternVideo2-stage2_1b-224p-f4.pt'\n",
 "# The same checkpoint path is written to both the top-level and the vision-encoder entries.\n",
 "config.pretrained_path = model_pth\n",
 "config['model']['vision_encoder']['pretrained'] = model_pth\n",
 "intern_model, tokenizer = setup_internvideo2(config)"
 ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "Video tensor([0.0030], device='cuda:0')\n", "Text tensor([-0.0008, -0.0001, -0.0013, -0.0014, 0.0005, -0.0004, -0.0004, -0.0006,\n", " 0.0001, -0.0003, 0.0003, 0.0012, -0.0004, 0.0007, -0.0014, -0.0017,\n", " -0.0007, -0.0018, -0.0006, -0.0024], device='cuda:0')\n", "text: Somebody walking ~ prob: 0.6945\n", "text: Playing with hat ~ prob: 0.1198\n", "text: A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride. ~ prob: 0.0297\n", "text: A person with a hat ~ prob: 0.0245\n", "text: A person dressed in a blue jacket shovels the snow-covered pavement outside their house. ~ prob: 0.0226\n", "text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.0222\n", "text: A group of friends playing bowling. ~ prob: 0.0212\n", "text: A person lying in bed ~ prob: 0.0208\n", "text: A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery. ~ prob: 0.0186\n", "text: A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees. ~ prob: 0.0102\n", "text: A person playing with a kid in the street ~ prob: 0.0045\n", "text: A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys. ~ prob: 0.0025\n", "text: A playful dog slides down a snowy hill, wagging its tail with delight. ~ prob: 0.0024\n", "text: A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees. ~ prob: 0.0015\n" ] }, { "ename": "", "evalue": "", "output_type": "error", "traceback": [ "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", "\u001b[1;31mClick here for more info. \n", "\u001b[1;31mView Jupyter log for further details." 
] } ], "source": [
 "# Rank the caption candidates against the sampled video frames and print the best matches.\n",
 "TOP_K = 14  # number of best-matching captions to report\n",
 "\n",
 "intern_model.eval()\n",
 "# Never ask for more results than there are candidates.\n",
 "texts, probs = retrieve_text(\n",
 "    frames,\n",
 "    text_candidates,\n",
 "    model=intern_model,\n",
 "    topk=min(TOP_K, len(text_candidates)),\n",
 "    config=config,\n",
 ")\n",
 "\n",
 "# Output recorded from an earlier run, kept for comparison. It lists 19 text values,\n",
 "# so it presumably predates the current 20-entry `text_candidates` list -- TODO confirm.\n",
 "# Video tensor([0.0023], device='cuda:0')\n",
 "# Text tensor([-0.0008, -0.0001, -0.0013, -0.0014, 0.0005, -0.0004, -0.0004, -0.0006,\n",
 "# 0.0001, -0.0003, 0.0003, 0.0012, -0.0004, 0.0007, -0.0014, -0.0017,\n",
 "# -0.0007, -0.0018, -0.0006], device='cuda:0')\n",
 "# text: A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery. ~ prob: 0.4592\n",
 "# text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.1335\n",
 "# text: A japanese girl eating noodles ~ prob: 0.1089\n",
 "\n",
 "for t, p in zip(texts, probs):\n",
 "    print(f'text: {t} ~ prob: {p:.4f}')"
 ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Parameter containing:\n", "tensor([[[[[-4.7913e-03, -2.1515e-03, -2.0447e-03, ..., 3.2997e-04,\n", " -3.0212e-03, -7.9727e-04],\n", " [ 9.7656e-04, 2.4567e-03, 9.8419e-04, ..., -1.8845e-03,\n", " 2.3193e-03, 3.6621e-03],\n", " [-3.5095e-04, 2.5940e-03, -2.7618e-03, ..., -3.7956e-04,\n", " -3.1948e-05, 1.7166e-03],\n", " ...,\n", " [-3.8605e-03, -1.1215e-03, -9.0790e-04, ..., 6.5994e-04,\n", " 1.0071e-03, 1.2894e-03],\n", " [-2.2278e-03, 1.2589e-03, -1.0204e-04, ..., 3.7079e-03,\n", " 1.5354e-04, -8.3160e-04],\n", " [ 6.3324e-04, 1.4114e-03, 9.5367e-04, ..., -3.4485e-03,\n", " -1.8234e-03, -4.0283e-03]]],\n", "\n", "\n", " [[[ 7.0190e-04, -1.3657e-03, -6.5994e-04, ..., 1.4725e-03,\n", " -8.5831e-04, 1.6212e-04],\n", " [ 1.7262e-04, 8.0872e-04, 4.1485e-05, ..., -6.4850e-04,\n", " 5.5695e-04, 1.7242e-03],\n", " [ 1.3504e-03, 3.2959e-03, -1.3275e-03, ..., 2.2736e-03,\n", " 4.2725e-04, 1.9150e-03],\n", " ...,\n", " [-2.3041e-03, -6.4850e-04, -2.8839e-03, ..., 2.9755e-04,\n", " -3.0518e-04, 1.2817e-03],\n", " [ 9.3079e-04, -1.2512e-03, -1.5335e-03, ..., 1.9455e-03,\n", " 
-3.4142e-04, -1.2054e-03],\n", " [ 9.1553e-03, 3.6774e-03, 2.2125e-03, ..., -5.3883e-05,\n", " 3.2234e-04, 2.3499e-03]]],\n", "\n", "\n", " [[[ 2.0752e-03, 7.4768e-04, 2.6512e-04, ..., 2.3193e-03,\n", " -3.3379e-04, -9.2983e-05],\n", " [ 1.4725e-03, 1.0986e-03, -8.8692e-05, ..., -2.8229e-04,\n", " 7.2098e-04, -2.2888e-03],\n", " [ 1.3809e-03, 1.5945e-03, 6.5231e-04, ..., 3.3112e-03,\n", " 2.1515e-03, -1.4114e-03],\n", " ...,\n", " [-1.2512e-03, 1.0605e-03, 5.6744e-05, ..., -4.7112e-04,\n", " -3.4714e-04, -1.6861e-03],\n", " [-5.4550e-04, 1.1978e-03, 1.9531e-03, ..., 7.6675e-04,\n", " -1.9150e-03, -1.6937e-03],\n", " [-4.5776e-03, -3.0212e-03, -1.4648e-03, ..., -1.0757e-03,\n", " 1.0061e-04, 2.9449e-03]]]],\n", "\n", "\n", "\n", " [[[[-9.9487e-03, -5.9814e-03, 3.9673e-03, ..., 7.8125e-03,\n", " 4.5776e-03, -4.7607e-03],\n", " [-7.5989e-03, 2.5940e-04, -6.0730e-03, ..., -1.4725e-03,\n", " -3.8300e-03, -2.4567e-03],\n", " [ 6.9427e-04, 3.1090e-04, -2.1515e-03, ..., -1.2779e-04,\n", " -6.0120e-03, -1.4191e-03],\n", " ...,\n", " [ 1.1597e-02, -8.3447e-05, -1.3428e-03, ..., -4.4556e-03,\n", " -4.4823e-04, -1.6861e-03],\n", " [ 4.3640e-03, -2.0447e-03, -1.3123e-03, ..., -4.4556e-03,\n", " -4.0283e-03, -4.6387e-03],\n", " [ 9.2163e-03, -5.1880e-03, 1.3351e-03, ..., 4.7112e-04,\n", " 2.6550e-03, 4.9744e-03]]],\n", "\n", "\n", " [[[ 4.3030e-03, -6.3171e-03, -1.2436e-03, ..., 2.1210e-03,\n", " -9.4250e-07, -1.0559e-02],\n", " [-1.2436e-03, -4.1504e-03, -9.7046e-03, ..., -2.8687e-03,\n", " -6.9885e-03, -9.7046e-03],\n", " [ 6.2561e-04, -5.7678e-03, -3.9978e-03, ..., -1.9989e-03,\n", " -4.5166e-03, -5.5542e-03],\n", " ...,\n", " [ 7.6904e-03, -2.9144e-03, -2.0905e-03, ..., -3.9368e-03,\n", " 2.1515e-03, -3.9062e-03],\n", " [-6.9427e-04, -2.9907e-03, -1.2512e-03, ..., 1.6785e-03,\n", " 5.8594e-03, -2.0294e-03],\n", " [ 7.8678e-05, -6.0730e-03, 1.0834e-03, ..., 2.9564e-04,\n", " 3.1738e-03, -8.4839e-03]]],\n", "\n", "\n", " [[[ 4.2419e-03, -7.5073e-03, -2.8381e-03, ..., 
-7.7515e-03,\n", " -6.6223e-03, 2.1667e-03],\n", " [ 8.1787e-03, 5.8899e-03, 1.0376e-03, ..., -1.8463e-03,\n", " -3.1281e-03, 5.8899e-03],\n", " [ 1.7776e-03, -4.2915e-04, 8.6975e-04, ..., -4.2915e-05,\n", " -3.2043e-03, 9.5825e-03],\n", " ...,\n", " [ 2.7847e-04, -1.9989e-03, -5.2490e-03, ..., -5.7068e-03,\n", " -4.5776e-04, -3.5095e-03],\n", " [-5.3406e-03, -6.9427e-04, -4.9133e-03, ..., -1.0910e-03,\n", " -6.4468e-04, -5.1880e-03],\n", " [-7.3853e-03, -2.1210e-03, 4.7302e-03, ..., 2.0752e-03,\n", " -2.0447e-03, -1.2329e-02]]]],\n", "\n", "\n", "\n", " [[[[ 4.4861e-03, -2.8992e-03, -4.7302e-03, ..., -5.3406e-03,\n", " -4.6692e-03, -4.6387e-03],\n", " [ 1.8921e-03, -5.6458e-03, -3.7079e-03, ..., -2.5482e-03,\n", " -4.8218e-03, 2.1515e-03],\n", " [ 4.5471e-03, 2.9755e-04, -3.7842e-03, ..., 3.6774e-03,\n", " -2.6550e-03, -1.8845e-03],\n", " ...,\n", " [ 7.2098e-04, 3.1281e-03, 2.0027e-04, ..., 2.7924e-03,\n", " 1.0986e-03, 3.4943e-03],\n", " [ 1.4496e-03, -2.8229e-04, 7.0801e-03, ..., 1.0071e-03,\n", " -3.9978e-03, 3.7689e-03],\n", " [ 9.9945e-04, 7.3624e-04, 9.7046e-03, ..., 3.9673e-03,\n", " 6.7139e-03, 1.1414e-02]]],\n", "\n", "\n", " [[[-3.5400e-03, -6.9809e-04, 3.9673e-03, ..., 7.1716e-04,\n", " 2.3651e-03, 1.6098e-03],\n", " [-1.7319e-03, 8.0109e-04, 2.7466e-03, ..., -1.7262e-04,\n", " -1.6937e-03, 6.1340e-03],\n", " [-3.9978e-03, 2.0599e-03, -2.4414e-03, ..., 2.2888e-03,\n", " 2.2736e-03, 4.1809e-03],\n", " ...,\n", " [-6.6223e-03, -1.0529e-03, -3.0823e-03, ..., 1.2894e-03,\n", " 1.7624e-03, -6.0425e-03],\n", " [ 7.5531e-04, -2.0599e-03, 2.0142e-03, ..., 3.3569e-03,\n", " 1.8215e-04, -7.1411e-03],\n", " [ 5.5237e-03, 2.3842e-04, 7.2937e-03, ..., -4.1809e-03,\n", " -4.4861e-03, -1.7700e-02]]],\n", "\n", "\n", " [[[-7.1716e-04, -1.5488e-03, -2.5635e-03, ..., 1.8692e-03,\n", " 5.4016e-03, 3.8300e-03],\n", " [ 1.9531e-03, -9.1934e-04, 2.0981e-05, ..., -5.3024e-04,\n", " -1.9989e-03, 1.1778e-04],\n", " [ 2.5635e-03, 4.4556e-03, -3.9978e-03, ..., 1.7548e-03,\n", " 
1.4114e-04, -1.2817e-03],\n", " ...,\n", " [-1.9226e-03, -4.1389e-04, -4.2114e-03, ..., 8.2016e-04,\n", " 5.0964e-03, 2.5330e-03],\n", " [ 8.4229e-03, 1.8539e-03, 1.4038e-03, ..., 2.4109e-03,\n", " 1.8616e-03, -1.0300e-03],\n", " [-8.6784e-05, -7.3547e-03, -1.5182e-03, ..., 1.5335e-03,\n", " 2.2736e-03, -8.4839e-03]]]],\n", "\n", "\n", "\n", " ...,\n", "\n", "\n", "\n", " [[[[-1.2207e-02, 4.0771e-02, -2.9419e-02, ..., 6.5918e-02,\n", " -2.6978e-02, 3.0640e-02],\n", " [ 7.8125e-02, 4.8828e-02, -5.3955e-02, ..., 4.3945e-02,\n", " -3.3447e-02, 3.4424e-02],\n", " [-1.3489e-02, 7.2021e-03, -5.0293e-02, ..., -1.9043e-02,\n", " -4.4189e-02, -3.7354e-02],\n", " ...,\n", " [-3.1250e-02, -1.1047e-02, 5.7617e-02, ..., -1.9287e-02,\n", " 6.2500e-02, -6.5308e-03],\n", " [-3.1738e-02, 5.6152e-03, -1.0986e-02, ..., -5.6763e-03,\n", " 2.3804e-02, 6.2500e-02],\n", " [-3.5400e-02, -4.4861e-03, 3.7109e-02, ..., 3.3691e-02,\n", " -7.1777e-02, 9.3750e-02]]],\n", "\n", "\n", " [[[-1.5332e-01, 3.3203e-02, -7.8125e-02, ..., 6.9824e-02,\n", " -1.1902e-02, 6.1340e-03],\n", " [ 5.1117e-04, 2.8809e-02, -7.9102e-02, ..., 5.4932e-02,\n", " -8.6670e-03, 2.2827e-02],\n", " [-7.3853e-03, 6.6528e-03, -4.9561e-02, ..., 5.6076e-04,\n", " -3.0029e-02, -1.6724e-02],\n", " ...,\n", " [ 6.7871e-02, 5.0049e-02, 7.0801e-02, ..., -6.1646e-03,\n", " 7.9102e-02, -4.5410e-02],\n", " [ 3.5706e-03, 6.5308e-03, -9.0942e-03, ..., -2.1210e-03,\n", " 4.9561e-02, 4.6143e-02],\n", " [ 5.6152e-02, 1.8311e-02, 4.5898e-02, ..., 2.8564e-02,\n", " -9.3262e-02, -4.9316e-02]]],\n", "\n", "\n", " [[[-5.1270e-02, 9.4727e-02, 2.1973e-02, ..., 4.9072e-02,\n", " -6.0547e-02, 6.9580e-03],\n", " [-1.7578e-02, 1.5869e-02, -5.0293e-02, ..., 9.4604e-03,\n", " -6.9336e-02, 7.2098e-04],\n", " [-6.4453e-02, -1.0620e-02, -7.0801e-02, ..., 3.5156e-02,\n", " -4.1016e-02, -3.4912e-02],\n", " ...,\n", " [-5.2185e-03, 3.0640e-02, 7.5195e-02, ..., 4.1260e-02,\n", " 8.6426e-02, -2.6367e-02],\n", " [ 2.8076e-03, 5.4626e-03, 2.0874e-02, ..., 
1.0452e-03,\n", " -1.2207e-02, 2.1973e-03],\n", " [-7.7148e-02, -5.0781e-02, 3.3936e-02, ..., 1.7334e-02,\n", " -1.2988e-01, -5.0781e-02]]]],\n", "\n", "\n", "\n", " [[[[-5.6885e-02, -1.1035e-01, -2.1118e-02, ..., 5.2002e-02,\n", " -7.9346e-03, -5.3711e-02],\n", " [ 2.9053e-02, 1.7944e-02, -1.0315e-02, ..., 3.6621e-02,\n", " 3.3936e-02, -1.4587e-02],\n", " [ 1.5259e-04, -2.6245e-02, -9.5703e-02, ..., 9.5825e-03,\n", " 6.3965e-02, 2.9907e-02],\n", " ...,\n", " [ 3.8818e-02, 2.9907e-02, 2.7710e-02, ..., -1.0938e-01,\n", " -4.6387e-02, 5.1575e-03],\n", " [-1.7212e-02, 1.6235e-02, -6.0547e-02, ..., -2.7710e-02,\n", " -5.9204e-03, 2.5024e-02],\n", " [-4.1504e-02, 1.3794e-02, -8.2520e-02, ..., 4.7852e-02,\n", " 8.2520e-02, -9.4238e-02]]],\n", "\n", "\n", " [[[ 3.7842e-02, -9.7656e-02, -4.6143e-02, ..., 2.5635e-02,\n", " -1.6479e-02, 1.9531e-03],\n", " [ 7.7148e-02, 4.0771e-02, -3.4027e-03, ..., 1.4648e-02,\n", " -2.0142e-02, -8.6670e-03],\n", " [ 6.1035e-02, 6.5308e-03, -2.5635e-02, ..., 5.0049e-02,\n", " 4.8828e-03, -2.4780e-02],\n", " ...,\n", " [ 9.9487e-03, -6.6528e-03, -1.1353e-02, ..., -1.1572e-01,\n", " 1.9043e-02, 5.9082e-02],\n", " [ 3.8086e-02, 4.2725e-02, -4.7363e-02, ..., 7.5989e-03,\n", " 6.4087e-03, 7.3853e-03],\n", " [-7.2632e-03, 9.0820e-02, -5.2979e-02, ..., 4.4678e-02,\n", " 6.6895e-02, -1.0303e-01]]],\n", "\n", "\n", " [[[ 1.4526e-02, -1.1475e-01, -3.8818e-02, ..., 8.1055e-02,\n", " 3.8818e-02, 2.1118e-02],\n", " [ 1.2354e-01, 5.1025e-02, 2.1973e-03, ..., 1.8677e-02,\n", " 1.5991e-02, -3.3203e-02],\n", " [ 7.7148e-02, 6.9336e-02, -3.7842e-02, ..., -3.0151e-02,\n", " 3.9062e-02, -2.2217e-02],\n", " ...,\n", " [ 4.3701e-02, 8.2520e-02, 1.0156e-01, ..., -8.5449e-02,\n", " -3.0060e-03, 1.0547e-01],\n", " [ 3.0640e-02, 8.0078e-02, 2.1606e-02, ..., -2.0264e-02,\n", " 1.9287e-02, 7.8613e-02],\n", " [ 4.4678e-02, 9.7168e-02, -4.9561e-02, ..., 3.6377e-02,\n", " 1.3477e-01, -2.7771e-03]]]],\n", "\n", "\n", "\n", " [[[[-2.0996e-01, -4.6387e-02, -5.6458e-03, ..., 
1.7334e-02,\n", " 4.6082e-03, -1.4038e-02],\n", " [-2.8931e-02, 2.0020e-02, -8.7891e-03, ..., -8.2520e-02,\n", " -5.2002e-02, -1.5869e-03],\n", " [ 7.1289e-02, 4.3335e-03, 1.1047e-02, ..., -7.5684e-03,\n", " -1.7456e-02, 1.5137e-02],\n", " ...,\n", " [ 5.0781e-02, 4.3213e-02, -8.1055e-02, ..., -3.9062e-02,\n", " -1.0693e-01, -2.9175e-02],\n", " [ 6.2500e-02, -3.9062e-03, -7.8735e-03, ..., -3.6377e-02,\n", " -4.8340e-02, 4.8340e-02],\n", " [-8.4473e-02, -3.3447e-02, -6.7383e-02, ..., 4.0527e-02,\n", " -6.9885e-03, 1.0547e-01]]],\n", "\n", "\n", " [[[-2.2266e-01, -4.6143e-02, -2.9541e-02, ..., 6.8054e-03,\n", " 3.4180e-02, -2.3682e-02],\n", " [-2.4170e-02, 5.7129e-02, 4.0771e-02, ..., -4.5898e-02,\n", " -1.1536e-02, 9.0942e-03],\n", " [ 7.8613e-02, 4.6631e-02, 9.9182e-04, ..., 4.2236e-02,\n", " 2.5879e-02, 4.2236e-02],\n", " ...,\n", " [ 5.7129e-02, 3.1250e-02, -7.7148e-02, ..., -2.1362e-02,\n", " -2.8809e-02, -6.3171e-03],\n", " [ 1.0352e-01, 6.2988e-02, -1.6602e-02, ..., -3.4668e-02,\n", " -5.9128e-04, 6.2988e-02],\n", " [-1.0193e-02, 5.0537e-02, -1.6479e-02, ..., 3.1738e-02,\n", " -4.3945e-02, -2.5146e-02]]],\n", "\n", "\n", " [[[ 5.5176e-02, -2.3804e-02, -2.0020e-02, ..., -5.1270e-03,\n", " -5.8899e-03, -2.4414e-02],\n", " [ 5.4199e-02, 4.0894e-03, 8.1787e-03, ..., -1.5320e-02,\n", " 1.3885e-03, 3.7842e-02],\n", " [ 1.0938e-01, 5.5847e-03, -1.3184e-02, ..., 4.1260e-02,\n", " 2.1484e-02, 5.2734e-02],\n", " ...,\n", " [-4.3297e-04, -1.1169e-02, -8.5449e-02, ..., 2.0264e-02,\n", " -4.7363e-02, -3.6774e-03],\n", " [ 5.5847e-03, -1.7212e-02, -4.9805e-02, ..., -7.6660e-02,\n", " -2.7466e-02, 4.8340e-02],\n", " [-9.3750e-02, -3.6377e-02, -6.6895e-02, ..., -3.0029e-02,\n", " -5.8350e-02, -6.1768e-02]]]]], device='cuda:0')" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Parameter containing:\n", "tensor([[[[[-4.7913e-03, -2.1515e-03, -2.0447e-03, ..., 3.2997e-04,\n", " -3.0212e-03, -7.9727e-04],\n", " [ 
9.7656e-04, 2.4567e-03, 9.8419e-04, ..., -1.8845e-03,\n", " 2.3193e-03, 3.6621e-03],\n", " [-3.5095e-04, 2.5940e-03, -2.7618e-03, ..., -3.7956e-04,\n", " -3.1948e-05, 1.7166e-03],\n", " ...,\n", " [-3.8605e-03, -1.1215e-03, -9.0790e-04, ..., 6.5994e-04,\n", " 1.0071e-03, 1.2894e-03],\n", " [-2.2278e-03, 1.2589e-03, -1.0204e-04, ..., 3.7079e-03,\n", " 1.5354e-04, -8.3160e-04],\n", " [ 6.3324e-04, 1.4114e-03, 9.5367e-04, ..., -3.4485e-03,\n", " -1.8234e-03, -4.0283e-03]]],\n", "\"\"\"\n", "intern_model.vision_encoder.patch_embed.proj.weight\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Parameter containing:\n", "tensor([[ 0.0366, 0.0135, 0.0492, ..., -0.0274, 0.0493, 0.0242],\n", " [-0.0113, 0.0504, 0.0469, ..., -0.0269, -0.0224, -0.0305],\n", " [ 0.0192, -0.0152, 0.0119, ..., 0.0115, -0.0182, -0.0063],\n", " ...,\n", " [-0.0370, -0.0460, 0.0203, ..., 0.0157, -0.0529, 0.0139],\n", " [-0.0523, -0.0192, -0.0612, ..., -0.0515, 0.0169, 0.0098],\n", " [ 0.0277, -0.0029, -0.0349, ..., 0.0014, -0.0453, 0.0052]],\n", " device='cuda:0')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\"\"\"\n", "Parameter containing:\n", "tensor([[ 0.0366, 0.0135, 0.0492, ..., -0.0274, 0.0493, 0.0242],\n", " [-0.0113, 0.0504, 0.0469, ..., -0.0269, -0.0224, -0.0305],\n", " [ 0.0192, -0.0152, 0.0119, ..., 0.0115, -0.0182, -0.0063],\n", " ...,\n", " [-0.0370, -0.0460, 0.0203, ..., 0.0157, -0.0529, 0.0139],\n", " [-0.0523, -0.0192, -0.0612, ..., -0.0515, 0.0169, 0.0098],\n", " [ 0.0277, -0.0029, -0.0349, ..., 0.0014, -0.0453, 0.0052]],\n", " device='cuda:0')\n", "\"\"\"\n", "intern_model.text_encoder.encoder.layer[0].output.dense.weight" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Video tensor([0.0005], device='cuda:0')\n", "# Text tensor([-1.1985e-03, 5.7084e-04, -7.3242e-05, -2.1923e-04, 1.3280e-03,\n", "# 6.7617e-05, -5.6482e-04, 
1.3007e-03, \n", "# 9.1326e-04, 5.7684e-04],\n", "# device='cuda:0')\n", "# text: A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees. ~ prob: 0.5572\n", "# text: A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys. ~ prob: 0.1044\n", "# text: A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon. ~ prob: 0.0958\n", "# text: A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride. ~ prob: 0.0936\n", "# text: A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run. ~ prob: 0.0404" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# /home/toolkit/.conda/envs/urlb_test/lib/python3.8/site-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n", "# warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n", "# Video tensor([-0.0014], device='cuda:0')\n", "# Text tensor([-1.8553e-03, -1.8098e-03, 5.9901e-04, -1.9457e-03, 4.7702e-05,\n", "# -2.8283e-03, -2.2676e-03, 7.7966e-04, -2.1556e-04, -3.8074e-04],\n", "# device='cuda:0')\n", "# text: A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride. ~ prob: 0.3186\n", "# text: A playful dog slides down a snowy hill, wagging its tail with delight. ~ prob: 0.1871\n", "# text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.1405\n", "# text: A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys. ~ prob: 0.1344\n", "# text: A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run. 
~ prob: 0.0955" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.18" } }, "nbformat": 4, "nbformat_minor": 4 }