Spaces:
Sleeping
Sleeping
File size: 83,949 Bytes
2d9a728 |
|
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/toolkit/.conda/envs/urlb_test/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"DropoutAddRMSNorm of flash_attn is not installed!!!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2024-04-16 22:03:29,983] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
]
}
],
"source": [
"import io\n",
"import os\n",
"import sys\n",
"\n",
"import numpy as np\n",
"import cv2\n",
"import torch\n",
"\n",
"# Root of the InternVideo2 multi_modality checkout. Set the INTERNVIDEO2_ROOT\n",
"# environment variable to run this notebook on another machine; the default is\n",
"# the path this notebook was originally executed with.\n",
"INTERNVIDEO2_ROOT = os.environ.get(\n",
"    'INTERNVIDEO2_ROOT',\n",
"    '/home/toolkit/eai_urlb/InternVideo/InternVideo2/multi_modality',\n",
")\n",
"# Both directories are prepended to sys.path before the project-local imports\n",
"# below (small_config / small_utils presumably live in one of them).\n",
"# The root is inserted last, so it ends up ahead of demo/ in the search order.\n",
"sys.path.insert(0, os.path.join(INTERNVIDEO2_ROOT, 'demo'))\n",
"sys.path.insert(0, INTERNVIDEO2_ROOT)\n",
"\n",
"from small_config import (Config, eval_dict_leaf)\n",
"from small_utils import (retrieve_text, _frame_from_video, setup_internvideo2)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"# Clip to analyse, relative to the notebook's working directory.\n",
"# To try a different clip (e.g. 'example1.mp4'), change video_path.\n",
"video_path = '../../../../video_samples/person_walking_video.mp4'\n",
"video = cv2.VideoCapture(video_path)\n",
"# cv2.VideoCapture does not raise when the file is missing or unreadable, so\n",
"# check explicitly instead of letting the failure surface in a later cell.\n",
"if not video.isOpened():\n",
"    raise FileNotFoundError(f'Could not open video: {video_path}')\n",
"# Materialise everything _frame_from_video yields into a list.\n",
"frames = list(_frame_from_video(video))"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"# Candidate captions (presumably what retrieve_text scores the clip against\n",
"# in a later cell - confirm against the call site).\n",
"text_candidates = [\n",
"    # Longer descriptions of snowy scenes.\n",
"    \"A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.\",\n",
"    \"A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.\",\n",
"    \"A person dressed in a blue jacket shovels the snow-covered pavement outside their house.\",\n",
"    \"A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.\",\n",
"    \"A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride.\",\n",
"    \"A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees.\",\n",
"    \"A playful dog slides down a snowy hill, wagging its tail with delight.\",\n",
"    \"A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.\",\n",
"    \"A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.\",\n",
"    \"A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery.\",\n",
"    # Short generic captions.\n",
"    \"A person playing with a kid in the street\",\n",
"    \"A group of friends playing bowling.\",\n",
"    \"A japanese girl eating noodles\",\n",
"    \"A painting by Monet\",\n",
"    \"A person lying in bed\",\n",
"    \"A person lying down on the grass\",\n",
"    \"A person with a hat\",\n",
"    \"Playing with hat\",\n",
"    \"Somebody walking\",\n",
"    \"Fidget spinner\",\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"load_state_dict: _IncompatibleKeys(missing_keys=['text_encoder.embeddings.position_ids', 'text_encoder.embeddings.word_embeddings.weight', 'text_encoder.embeddings.position_embeddings.weight', 'text_encoder.embeddings.token_type_embeddings.weight', 'text_encoder.embeddings.LayerNorm.weight', 'text_encoder.embeddings.LayerNorm.bias', 'text_encoder.encoder.layer.0.attention.self.query.weight', 'text_encoder.encoder.layer.0.attention.self.query.bias', 'text_encoder.encoder.layer.0.attention.self.key.weight', 'text_encoder.encoder.layer.0.attention.self.key.bias', 'text_encoder.encoder.layer.0.attention.self.value.weight', 'text_encoder.encoder.layer.0.attention.self.value.bias', 'text_encoder.encoder.layer.0.attention.output.dense.weight', 'text_encoder.encoder.layer.0.attention.output.dense.bias', 'text_encoder.encoder.layer.0.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.0.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.0.intermediate.dense.weight', 'text_encoder.encoder.layer.0.intermediate.dense.bias', 'text_encoder.encoder.layer.0.output.dense.weight', 'text_encoder.encoder.layer.0.output.dense.bias', 'text_encoder.encoder.layer.0.output.LayerNorm.weight', 'text_encoder.encoder.layer.0.output.LayerNorm.bias', 'text_encoder.encoder.layer.1.attention.self.query.weight', 'text_encoder.encoder.layer.1.attention.self.query.bias', 'text_encoder.encoder.layer.1.attention.self.key.weight', 'text_encoder.encoder.layer.1.attention.self.key.bias', 'text_encoder.encoder.layer.1.attention.self.value.weight', 'text_encoder.encoder.layer.1.attention.self.value.bias', 'text_encoder.encoder.layer.1.attention.output.dense.weight', 'text_encoder.encoder.layer.1.attention.output.dense.bias', 'text_encoder.encoder.layer.1.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.1.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.1.intermediate.dense.weight', 'text_encoder.encoder.layer.1.intermediate.dense.bias', 
'text_encoder.encoder.layer.1.output.dense.weight', 'text_encoder.encoder.layer.1.output.dense.bias', 'text_encoder.encoder.layer.1.output.LayerNorm.weight', 'text_encoder.encoder.layer.1.output.LayerNorm.bias', 'text_encoder.encoder.layer.2.attention.self.query.weight', 'text_encoder.encoder.layer.2.attention.self.query.bias', 'text_encoder.encoder.layer.2.attention.self.key.weight', 'text_encoder.encoder.layer.2.attention.self.key.bias', 'text_encoder.encoder.layer.2.attention.self.value.weight', 'text_encoder.encoder.layer.2.attention.self.value.bias', 'text_encoder.encoder.layer.2.attention.output.dense.weight', 'text_encoder.encoder.layer.2.attention.output.dense.bias', 'text_encoder.encoder.layer.2.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.2.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.2.intermediate.dense.weight', 'text_encoder.encoder.layer.2.intermediate.dense.bias', 'text_encoder.encoder.layer.2.output.dense.weight', 'text_encoder.encoder.layer.2.output.dense.bias', 'text_encoder.encoder.layer.2.output.LayerNorm.weight', 'text_encoder.encoder.layer.2.output.LayerNorm.bias', 'text_encoder.encoder.layer.3.attention.self.query.weight', 'text_encoder.encoder.layer.3.attention.self.query.bias', 'text_encoder.encoder.layer.3.attention.self.key.weight', 'text_encoder.encoder.layer.3.attention.self.key.bias', 'text_encoder.encoder.layer.3.attention.self.value.weight', 'text_encoder.encoder.layer.3.attention.self.value.bias', 'text_encoder.encoder.layer.3.attention.output.dense.weight', 'text_encoder.encoder.layer.3.attention.output.dense.bias', 'text_encoder.encoder.layer.3.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.3.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.3.intermediate.dense.weight', 'text_encoder.encoder.layer.3.intermediate.dense.bias', 'text_encoder.encoder.layer.3.output.dense.weight', 'text_encoder.encoder.layer.3.output.dense.bias', 
'text_encoder.encoder.layer.3.output.LayerNorm.weight', 'text_encoder.encoder.layer.3.output.LayerNorm.bias', 'text_encoder.encoder.layer.4.attention.self.query.weight', 'text_encoder.encoder.layer.4.attention.self.query.bias', 'text_encoder.encoder.layer.4.attention.self.key.weight', 'text_encoder.encoder.layer.4.attention.self.key.bias', 'text_encoder.encoder.layer.4.attention.self.value.weight', 'text_encoder.encoder.layer.4.attention.self.value.bias', 'text_encoder.encoder.layer.4.attention.output.dense.weight', 'text_encoder.encoder.layer.4.attention.output.dense.bias', 'text_encoder.encoder.layer.4.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.4.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.4.intermediate.dense.weight', 'text_encoder.encoder.layer.4.intermediate.dense.bias', 'text_encoder.encoder.layer.4.output.dense.weight', 'text_encoder.encoder.layer.4.output.dense.bias', 'text_encoder.encoder.layer.4.output.LayerNorm.weight', 'text_encoder.encoder.layer.4.output.LayerNorm.bias', 'text_encoder.encoder.layer.5.attention.self.query.weight', 'text_encoder.encoder.layer.5.attention.self.query.bias', 'text_encoder.encoder.layer.5.attention.self.key.weight', 'text_encoder.encoder.layer.5.attention.self.key.bias', 'text_encoder.encoder.layer.5.attention.self.value.weight', 'text_encoder.encoder.layer.5.attention.self.value.bias', 'text_encoder.encoder.layer.5.attention.output.dense.weight', 'text_encoder.encoder.layer.5.attention.output.dense.bias', 'text_encoder.encoder.layer.5.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.5.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.5.intermediate.dense.weight', 'text_encoder.encoder.layer.5.intermediate.dense.bias', 'text_encoder.encoder.layer.5.output.dense.weight', 'text_encoder.encoder.layer.5.output.dense.bias', 'text_encoder.encoder.layer.5.output.LayerNorm.weight', 'text_encoder.encoder.layer.5.output.LayerNorm.bias', 
'text_encoder.encoder.layer.6.attention.self.query.weight', 'text_encoder.encoder.layer.6.attention.self.query.bias', 'text_encoder.encoder.layer.6.attention.self.key.weight', 'text_encoder.encoder.layer.6.attention.self.key.bias', 'text_encoder.encoder.layer.6.attention.self.value.weight', 'text_encoder.encoder.layer.6.attention.self.value.bias', 'text_encoder.encoder.layer.6.attention.output.dense.weight', 'text_encoder.encoder.layer.6.attention.output.dense.bias', 'text_encoder.encoder.layer.6.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.6.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.6.intermediate.dense.weight', 'text_encoder.encoder.layer.6.intermediate.dense.bias', 'text_encoder.encoder.layer.6.output.dense.weight', 'text_encoder.encoder.layer.6.output.dense.bias', 'text_encoder.encoder.layer.6.output.LayerNorm.weight', 'text_encoder.encoder.layer.6.output.LayerNorm.bias', 'text_encoder.encoder.layer.7.attention.self.query.weight', 'text_encoder.encoder.layer.7.attention.self.query.bias', 'text_encoder.encoder.layer.7.attention.self.key.weight', 'text_encoder.encoder.layer.7.attention.self.key.bias', 'text_encoder.encoder.layer.7.attention.self.value.weight', 'text_encoder.encoder.layer.7.attention.self.value.bias', 'text_encoder.encoder.layer.7.attention.output.dense.weight', 'text_encoder.encoder.layer.7.attention.output.dense.bias', 'text_encoder.encoder.layer.7.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.7.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.7.intermediate.dense.weight', 'text_encoder.encoder.layer.7.intermediate.dense.bias', 'text_encoder.encoder.layer.7.output.dense.weight', 'text_encoder.encoder.layer.7.output.dense.bias', 'text_encoder.encoder.layer.7.output.LayerNorm.weight', 'text_encoder.encoder.layer.7.output.LayerNorm.bias', 'text_encoder.encoder.layer.8.attention.self.query.weight', 'text_encoder.encoder.layer.8.attention.self.query.bias', 
'text_encoder.encoder.layer.8.attention.self.key.weight', 'text_encoder.encoder.layer.8.attention.self.key.bias', 'text_encoder.encoder.layer.8.attention.self.value.weight', 'text_encoder.encoder.layer.8.attention.self.value.bias', 'text_encoder.encoder.layer.8.attention.output.dense.weight', 'text_encoder.encoder.layer.8.attention.output.dense.bias', 'text_encoder.encoder.layer.8.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.8.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.8.intermediate.dense.weight', 'text_encoder.encoder.layer.8.intermediate.dense.bias', 'text_encoder.encoder.layer.8.output.dense.weight', 'text_encoder.encoder.layer.8.output.dense.bias', 'text_encoder.encoder.layer.8.output.LayerNorm.weight', 'text_encoder.encoder.layer.8.output.LayerNorm.bias', 'text_encoder.encoder.layer.9.attention.self.query.weight', 'text_encoder.encoder.layer.9.attention.self.query.bias', 'text_encoder.encoder.layer.9.attention.self.key.weight', 'text_encoder.encoder.layer.9.attention.self.key.bias', 'text_encoder.encoder.layer.9.attention.self.value.weight', 'text_encoder.encoder.layer.9.attention.self.value.bias', 'text_encoder.encoder.layer.9.attention.output.dense.weight', 'text_encoder.encoder.layer.9.attention.output.dense.bias', 'text_encoder.encoder.layer.9.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.9.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.9.intermediate.dense.weight', 'text_encoder.encoder.layer.9.intermediate.dense.bias', 'text_encoder.encoder.layer.9.output.dense.weight', 'text_encoder.encoder.layer.9.output.dense.bias', 'text_encoder.encoder.layer.9.output.LayerNorm.weight', 'text_encoder.encoder.layer.9.output.LayerNorm.bias', 'text_encoder.encoder.layer.10.attention.self.query.weight', 'text_encoder.encoder.layer.10.attention.self.query.bias', 'text_encoder.encoder.layer.10.attention.self.key.weight', 'text_encoder.encoder.layer.10.attention.self.key.bias', 
'text_encoder.encoder.layer.10.attention.self.value.weight', 'text_encoder.encoder.layer.10.attention.self.value.bias', 'text_encoder.encoder.layer.10.attention.output.dense.weight', 'text_encoder.encoder.layer.10.attention.output.dense.bias', 'text_encoder.encoder.layer.10.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.10.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.10.intermediate.dense.weight', 'text_encoder.encoder.layer.10.intermediate.dense.bias', 'text_encoder.encoder.layer.10.output.dense.weight', 'text_encoder.encoder.layer.10.output.dense.bias', 'text_encoder.encoder.layer.10.output.LayerNorm.weight', 'text_encoder.encoder.layer.10.output.LayerNorm.bias', 'text_encoder.encoder.layer.11.attention.self.query.weight', 'text_encoder.encoder.layer.11.attention.self.query.bias', 'text_encoder.encoder.layer.11.attention.self.key.weight', 'text_encoder.encoder.layer.11.attention.self.key.bias', 'text_encoder.encoder.layer.11.attention.self.value.weight', 'text_encoder.encoder.layer.11.attention.self.value.bias', 'text_encoder.encoder.layer.11.attention.output.dense.weight', 'text_encoder.encoder.layer.11.attention.output.dense.bias', 'text_encoder.encoder.layer.11.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.11.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.11.intermediate.dense.weight', 'text_encoder.encoder.layer.11.intermediate.dense.bias', 'text_encoder.encoder.layer.11.output.dense.weight', 'text_encoder.encoder.layer.11.output.dense.bias', 'text_encoder.encoder.layer.11.output.LayerNorm.weight', 'text_encoder.encoder.layer.11.output.LayerNorm.bias', 'text_encoder.encoder.layer.12.attention.self.query.weight', 'text_encoder.encoder.layer.12.attention.self.query.bias', 'text_encoder.encoder.layer.12.attention.self.key.weight', 'text_encoder.encoder.layer.12.attention.self.key.bias', 'text_encoder.encoder.layer.12.attention.self.value.weight', 
'text_encoder.encoder.layer.12.attention.self.value.bias', 'text_encoder.encoder.layer.12.attention.output.dense.weight', 'text_encoder.encoder.layer.12.attention.output.dense.bias', 'text_encoder.encoder.layer.12.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.12.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.12.intermediate.dense.weight', 'text_encoder.encoder.layer.12.intermediate.dense.bias', 'text_encoder.encoder.layer.12.output.dense.weight', 'text_encoder.encoder.layer.12.output.dense.bias', 'text_encoder.encoder.layer.12.output.LayerNorm.weight', 'text_encoder.encoder.layer.12.output.LayerNorm.bias', 'text_encoder.encoder.layer.13.attention.self.query.weight', 'text_encoder.encoder.layer.13.attention.self.query.bias', 'text_encoder.encoder.layer.13.attention.self.key.weight', 'text_encoder.encoder.layer.13.attention.self.key.bias', 'text_encoder.encoder.layer.13.attention.self.value.weight', 'text_encoder.encoder.layer.13.attention.self.value.bias', 'text_encoder.encoder.layer.13.attention.output.dense.weight', 'text_encoder.encoder.layer.13.attention.output.dense.bias', 'text_encoder.encoder.layer.13.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.13.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.13.intermediate.dense.weight', 'text_encoder.encoder.layer.13.intermediate.dense.bias', 'text_encoder.encoder.layer.13.output.dense.weight', 'text_encoder.encoder.layer.13.output.dense.bias', 'text_encoder.encoder.layer.13.output.LayerNorm.weight', 'text_encoder.encoder.layer.13.output.LayerNorm.bias', 'text_encoder.encoder.layer.14.attention.self.query.weight', 'text_encoder.encoder.layer.14.attention.self.query.bias', 'text_encoder.encoder.layer.14.attention.self.key.weight', 'text_encoder.encoder.layer.14.attention.self.key.bias', 'text_encoder.encoder.layer.14.attention.self.value.weight', 'text_encoder.encoder.layer.14.attention.self.value.bias', 
'text_encoder.encoder.layer.14.attention.output.dense.weight', 'text_encoder.encoder.layer.14.attention.output.dense.bias', 'text_encoder.encoder.layer.14.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.14.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.14.intermediate.dense.weight', 'text_encoder.encoder.layer.14.intermediate.dense.bias', 'text_encoder.encoder.layer.14.output.dense.weight', 'text_encoder.encoder.layer.14.output.dense.bias', 'text_encoder.encoder.layer.14.output.LayerNorm.weight', 'text_encoder.encoder.layer.14.output.LayerNorm.bias', 'text_encoder.encoder.layer.15.attention.self.query.weight', 'text_encoder.encoder.layer.15.attention.self.query.bias', 'text_encoder.encoder.layer.15.attention.self.key.weight', 'text_encoder.encoder.layer.15.attention.self.key.bias', 'text_encoder.encoder.layer.15.attention.self.value.weight', 'text_encoder.encoder.layer.15.attention.self.value.bias', 'text_encoder.encoder.layer.15.attention.output.dense.weight', 'text_encoder.encoder.layer.15.attention.output.dense.bias', 'text_encoder.encoder.layer.15.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.15.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.15.intermediate.dense.weight', 'text_encoder.encoder.layer.15.intermediate.dense.bias', 'text_encoder.encoder.layer.15.output.dense.weight', 'text_encoder.encoder.layer.15.output.dense.bias', 'text_encoder.encoder.layer.15.output.LayerNorm.weight', 'text_encoder.encoder.layer.15.output.LayerNorm.bias', 'text_encoder.encoder.layer.16.attention.self.query.weight', 'text_encoder.encoder.layer.16.attention.self.query.bias', 'text_encoder.encoder.layer.16.attention.self.key.weight', 'text_encoder.encoder.layer.16.attention.self.key.bias', 'text_encoder.encoder.layer.16.attention.self.value.weight', 'text_encoder.encoder.layer.16.attention.self.value.bias', 'text_encoder.encoder.layer.16.attention.output.dense.weight', 
'text_encoder.encoder.layer.16.attention.output.dense.bias', 'text_encoder.encoder.layer.16.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.16.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.16.intermediate.dense.weight', 'text_encoder.encoder.layer.16.intermediate.dense.bias', 'text_encoder.encoder.layer.16.output.dense.weight', 'text_encoder.encoder.layer.16.output.dense.bias', 'text_encoder.encoder.layer.16.output.LayerNorm.weight', 'text_encoder.encoder.layer.16.output.LayerNorm.bias', 'text_encoder.encoder.layer.17.attention.self.query.weight', 'text_encoder.encoder.layer.17.attention.self.query.bias', 'text_encoder.encoder.layer.17.attention.self.key.weight', 'text_encoder.encoder.layer.17.attention.self.key.bias', 'text_encoder.encoder.layer.17.attention.self.value.weight', 'text_encoder.encoder.layer.17.attention.self.value.bias', 'text_encoder.encoder.layer.17.attention.output.dense.weight', 'text_encoder.encoder.layer.17.attention.output.dense.bias', 'text_encoder.encoder.layer.17.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.17.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.17.intermediate.dense.weight', 'text_encoder.encoder.layer.17.intermediate.dense.bias', 'text_encoder.encoder.layer.17.output.dense.weight', 'text_encoder.encoder.layer.17.output.dense.bias', 'text_encoder.encoder.layer.17.output.LayerNorm.weight', 'text_encoder.encoder.layer.17.output.LayerNorm.bias', 'text_encoder.encoder.layer.18.attention.self.query.weight', 'text_encoder.encoder.layer.18.attention.self.query.bias', 'text_encoder.encoder.layer.18.attention.self.key.weight', 'text_encoder.encoder.layer.18.attention.self.key.bias', 'text_encoder.encoder.layer.18.attention.self.value.weight', 'text_encoder.encoder.layer.18.attention.self.value.bias', 'text_encoder.encoder.layer.18.attention.output.dense.weight', 'text_encoder.encoder.layer.18.attention.output.dense.bias', 
'text_encoder.encoder.layer.18.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.18.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.18.intermediate.dense.weight', 'text_encoder.encoder.layer.18.intermediate.dense.bias', 'text_encoder.encoder.layer.18.output.dense.weight', 'text_encoder.encoder.layer.18.output.dense.bias', 'text_encoder.encoder.layer.18.output.LayerNorm.weight', 'text_encoder.encoder.layer.18.output.LayerNorm.bias', 'text_encoder.encoder.layer.19.attention.self.query.weight', 'text_encoder.encoder.layer.19.attention.self.query.bias', 'text_encoder.encoder.layer.19.attention.self.key.weight', 'text_encoder.encoder.layer.19.attention.self.key.bias', 'text_encoder.encoder.layer.19.attention.self.value.weight', 'text_encoder.encoder.layer.19.attention.self.value.bias', 'text_encoder.encoder.layer.19.attention.output.dense.weight', 'text_encoder.encoder.layer.19.attention.output.dense.bias', 'text_encoder.encoder.layer.19.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.19.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.19.crossattention.self.query.weight', 'text_encoder.encoder.layer.19.crossattention.self.query.bias', 'text_encoder.encoder.layer.19.crossattention.self.key.weight', 'text_encoder.encoder.layer.19.crossattention.self.key.bias', 'text_encoder.encoder.layer.19.crossattention.self.value.weight', 'text_encoder.encoder.layer.19.crossattention.self.value.bias', 'text_encoder.encoder.layer.19.crossattention.output.dense.weight', 'text_encoder.encoder.layer.19.crossattention.output.dense.bias', 'text_encoder.encoder.layer.19.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.19.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.19.intermediate.dense.weight', 'text_encoder.encoder.layer.19.intermediate.dense.bias', 'text_encoder.encoder.layer.19.output.dense.weight', 'text_encoder.encoder.layer.19.output.dense.bias', 
'text_encoder.encoder.layer.19.output.LayerNorm.weight', 'text_encoder.encoder.layer.19.output.LayerNorm.bias', 'text_encoder.encoder.layer.20.attention.self.query.weight', 'text_encoder.encoder.layer.20.attention.self.query.bias', 'text_encoder.encoder.layer.20.attention.self.key.weight', 'text_encoder.encoder.layer.20.attention.self.key.bias', 'text_encoder.encoder.layer.20.attention.self.value.weight', 'text_encoder.encoder.layer.20.attention.self.value.bias', 'text_encoder.encoder.layer.20.attention.output.dense.weight', 'text_encoder.encoder.layer.20.attention.output.dense.bias', 'text_encoder.encoder.layer.20.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.20.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.20.crossattention.self.query.weight', 'text_encoder.encoder.layer.20.crossattention.self.query.bias', 'text_encoder.encoder.layer.20.crossattention.self.key.weight', 'text_encoder.encoder.layer.20.crossattention.self.key.bias', 'text_encoder.encoder.layer.20.crossattention.self.value.weight', 'text_encoder.encoder.layer.20.crossattention.self.value.bias', 'text_encoder.encoder.layer.20.crossattention.output.dense.weight', 'text_encoder.encoder.layer.20.crossattention.output.dense.bias', 'text_encoder.encoder.layer.20.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.20.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.20.intermediate.dense.weight', 'text_encoder.encoder.layer.20.intermediate.dense.bias', 'text_encoder.encoder.layer.20.output.dense.weight', 'text_encoder.encoder.layer.20.output.dense.bias', 'text_encoder.encoder.layer.20.output.LayerNorm.weight', 'text_encoder.encoder.layer.20.output.LayerNorm.bias', 'text_encoder.encoder.layer.21.attention.self.query.weight', 'text_encoder.encoder.layer.21.attention.self.query.bias', 'text_encoder.encoder.layer.21.attention.self.key.weight', 'text_encoder.encoder.layer.21.attention.self.key.bias', 
'text_encoder.encoder.layer.21.attention.self.value.weight', 'text_encoder.encoder.layer.21.attention.self.value.bias', 'text_encoder.encoder.layer.21.attention.output.dense.weight', 'text_encoder.encoder.layer.21.attention.output.dense.bias', 'text_encoder.encoder.layer.21.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.21.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.21.crossattention.self.query.weight', 'text_encoder.encoder.layer.21.crossattention.self.query.bias', 'text_encoder.encoder.layer.21.crossattention.self.key.weight', 'text_encoder.encoder.layer.21.crossattention.self.key.bias', 'text_encoder.encoder.layer.21.crossattention.self.value.weight', 'text_encoder.encoder.layer.21.crossattention.self.value.bias', 'text_encoder.encoder.layer.21.crossattention.output.dense.weight', 'text_encoder.encoder.layer.21.crossattention.output.dense.bias', 'text_encoder.encoder.layer.21.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.21.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.21.intermediate.dense.weight', 'text_encoder.encoder.layer.21.intermediate.dense.bias', 'text_encoder.encoder.layer.21.output.dense.weight', 'text_encoder.encoder.layer.21.output.dense.bias', 'text_encoder.encoder.layer.21.output.LayerNorm.weight', 'text_encoder.encoder.layer.21.output.LayerNorm.bias', 'text_encoder.encoder.layer.22.attention.self.query.weight', 'text_encoder.encoder.layer.22.attention.self.query.bias', 'text_encoder.encoder.layer.22.attention.self.key.weight', 'text_encoder.encoder.layer.22.attention.self.key.bias', 'text_encoder.encoder.layer.22.attention.self.value.weight', 'text_encoder.encoder.layer.22.attention.self.value.bias', 'text_encoder.encoder.layer.22.attention.output.dense.weight', 'text_encoder.encoder.layer.22.attention.output.dense.bias', 'text_encoder.encoder.layer.22.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.22.attention.output.LayerNorm.bias', 
'text_encoder.encoder.layer.22.crossattention.self.query.weight', 'text_encoder.encoder.layer.22.crossattention.self.query.bias', 'text_encoder.encoder.layer.22.crossattention.self.key.weight', 'text_encoder.encoder.layer.22.crossattention.self.key.bias', 'text_encoder.encoder.layer.22.crossattention.self.value.weight', 'text_encoder.encoder.layer.22.crossattention.self.value.bias', 'text_encoder.encoder.layer.22.crossattention.output.dense.weight', 'text_encoder.encoder.layer.22.crossattention.output.dense.bias', 'text_encoder.encoder.layer.22.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.22.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.22.intermediate.dense.weight', 'text_encoder.encoder.layer.22.intermediate.dense.bias', 'text_encoder.encoder.layer.22.output.dense.weight', 'text_encoder.encoder.layer.22.output.dense.bias', 'text_encoder.encoder.layer.22.output.LayerNorm.weight', 'text_encoder.encoder.layer.22.output.LayerNorm.bias', 'text_encoder.encoder.layer.23.attention.self.query.weight', 'text_encoder.encoder.layer.23.attention.self.query.bias', 'text_encoder.encoder.layer.23.attention.self.key.weight', 'text_encoder.encoder.layer.23.attention.self.key.bias', 'text_encoder.encoder.layer.23.attention.self.value.weight', 'text_encoder.encoder.layer.23.attention.self.value.bias', 'text_encoder.encoder.layer.23.attention.output.dense.weight', 'text_encoder.encoder.layer.23.attention.output.dense.bias', 'text_encoder.encoder.layer.23.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.23.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.23.crossattention.self.query.weight', 'text_encoder.encoder.layer.23.crossattention.self.query.bias', 'text_encoder.encoder.layer.23.crossattention.self.key.weight', 'text_encoder.encoder.layer.23.crossattention.self.key.bias', 'text_encoder.encoder.layer.23.crossattention.self.value.weight', 'text_encoder.encoder.layer.23.crossattention.self.value.bias', 
'text_encoder.encoder.layer.23.crossattention.output.dense.weight', 'text_encoder.encoder.layer.23.crossattention.output.dense.bias', 'text_encoder.encoder.layer.23.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.23.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.23.intermediate.dense.weight', 'text_encoder.encoder.layer.23.intermediate.dense.bias', 'text_encoder.encoder.layer.23.output.dense.weight', 'text_encoder.encoder.layer.23.output.dense.bias', 'text_encoder.encoder.layer.23.output.LayerNorm.weight', 'text_encoder.encoder.layer.23.output.LayerNorm.bias'], unexpected_keys=['temp', 'itm_head.weight', 'itm_head.bias', 'text_encoder.bert.embeddings.position_ids', 'text_encoder.bert.embeddings.word_embeddings.weight', 'text_encoder.bert.embeddings.position_embeddings.weight', 'text_encoder.bert.embeddings.token_type_embeddings.weight', 'text_encoder.bert.embeddings.LayerNorm.weight', 'text_encoder.bert.embeddings.LayerNorm.bias', 'text_encoder.bert.encoder.layer.0.attention.self.query.weight', 'text_encoder.bert.encoder.layer.0.attention.self.query.bias', 'text_encoder.bert.encoder.layer.0.attention.self.key.weight', 'text_encoder.bert.encoder.layer.0.attention.self.key.bias', 'text_encoder.bert.encoder.layer.0.attention.self.value.weight', 'text_encoder.bert.encoder.layer.0.attention.self.value.bias', 'text_encoder.bert.encoder.layer.0.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.0.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.0.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.0.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.0.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.0.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.0.output.dense.weight', 'text_encoder.bert.encoder.layer.0.output.dense.bias', 'text_encoder.bert.encoder.layer.0.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.0.output.LayerNorm.bias', 
'text_encoder.bert.encoder.layer.1.attention.self.query.weight', 'text_encoder.bert.encoder.layer.1.attention.self.query.bias', 'text_encoder.bert.encoder.layer.1.attention.self.key.weight', 'text_encoder.bert.encoder.layer.1.attention.self.key.bias', 'text_encoder.bert.encoder.layer.1.attention.self.value.weight', 'text_encoder.bert.encoder.layer.1.attention.self.value.bias', 'text_encoder.bert.encoder.layer.1.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.1.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.1.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.1.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.1.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.1.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.1.output.dense.weight', 'text_encoder.bert.encoder.layer.1.output.dense.bias', 'text_encoder.bert.encoder.layer.1.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.1.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.2.attention.self.query.weight', 'text_encoder.bert.encoder.layer.2.attention.self.query.bias', 'text_encoder.bert.encoder.layer.2.attention.self.key.weight', 'text_encoder.bert.encoder.layer.2.attention.self.key.bias', 'text_encoder.bert.encoder.layer.2.attention.self.value.weight', 'text_encoder.bert.encoder.layer.2.attention.self.value.bias', 'text_encoder.bert.encoder.layer.2.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.2.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.2.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.2.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.2.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.2.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.2.output.dense.weight', 'text_encoder.bert.encoder.layer.2.output.dense.bias', 'text_encoder.bert.encoder.layer.2.output.LayerNorm.weight', 
'text_encoder.bert.encoder.layer.2.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.3.attention.self.query.weight', 'text_encoder.bert.encoder.layer.3.attention.self.query.bias', 'text_encoder.bert.encoder.layer.3.attention.self.key.weight', 'text_encoder.bert.encoder.layer.3.attention.self.key.bias', 'text_encoder.bert.encoder.layer.3.attention.self.value.weight', 'text_encoder.bert.encoder.layer.3.attention.self.value.bias', 'text_encoder.bert.encoder.layer.3.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.3.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.3.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.3.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.3.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.3.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.3.output.dense.weight', 'text_encoder.bert.encoder.layer.3.output.dense.bias', 'text_encoder.bert.encoder.layer.3.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.3.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.4.attention.self.query.weight', 'text_encoder.bert.encoder.layer.4.attention.self.query.bias', 'text_encoder.bert.encoder.layer.4.attention.self.key.weight', 'text_encoder.bert.encoder.layer.4.attention.self.key.bias', 'text_encoder.bert.encoder.layer.4.attention.self.value.weight', 'text_encoder.bert.encoder.layer.4.attention.self.value.bias', 'text_encoder.bert.encoder.layer.4.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.4.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.4.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.4.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.4.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.4.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.4.output.dense.weight', 'text_encoder.bert.encoder.layer.4.output.dense.bias', 
'text_encoder.bert.encoder.layer.4.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.4.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.5.attention.self.query.weight', 'text_encoder.bert.encoder.layer.5.attention.self.query.bias', 'text_encoder.bert.encoder.layer.5.attention.self.key.weight', 'text_encoder.bert.encoder.layer.5.attention.self.key.bias', 'text_encoder.bert.encoder.layer.5.attention.self.value.weight', 'text_encoder.bert.encoder.layer.5.attention.self.value.bias', 'text_encoder.bert.encoder.layer.5.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.5.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.5.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.5.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.5.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.5.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.5.output.dense.weight', 'text_encoder.bert.encoder.layer.5.output.dense.bias', 'text_encoder.bert.encoder.layer.5.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.5.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.6.attention.self.query.weight', 'text_encoder.bert.encoder.layer.6.attention.self.query.bias', 'text_encoder.bert.encoder.layer.6.attention.self.key.weight', 'text_encoder.bert.encoder.layer.6.attention.self.key.bias', 'text_encoder.bert.encoder.layer.6.attention.self.value.weight', 'text_encoder.bert.encoder.layer.6.attention.self.value.bias', 'text_encoder.bert.encoder.layer.6.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.6.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.6.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.6.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.6.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.6.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.6.output.dense.weight', 
'text_encoder.bert.encoder.layer.6.output.dense.bias', 'text_encoder.bert.encoder.layer.6.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.6.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.7.attention.self.query.weight', 'text_encoder.bert.encoder.layer.7.attention.self.query.bias', 'text_encoder.bert.encoder.layer.7.attention.self.key.weight', 'text_encoder.bert.encoder.layer.7.attention.self.key.bias', 'text_encoder.bert.encoder.layer.7.attention.self.value.weight', 'text_encoder.bert.encoder.layer.7.attention.self.value.bias', 'text_encoder.bert.encoder.layer.7.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.7.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.7.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.7.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.7.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.7.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.7.output.dense.weight', 'text_encoder.bert.encoder.layer.7.output.dense.bias', 'text_encoder.bert.encoder.layer.7.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.7.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.8.attention.self.query.weight', 'text_encoder.bert.encoder.layer.8.attention.self.query.bias', 'text_encoder.bert.encoder.layer.8.attention.self.key.weight', 'text_encoder.bert.encoder.layer.8.attention.self.key.bias', 'text_encoder.bert.encoder.layer.8.attention.self.value.weight', 'text_encoder.bert.encoder.layer.8.attention.self.value.bias', 'text_encoder.bert.encoder.layer.8.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.8.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.8.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.8.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.8.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.8.intermediate.dense.bias', 
'text_encoder.bert.encoder.layer.8.output.dense.weight', 'text_encoder.bert.encoder.layer.8.output.dense.bias', 'text_encoder.bert.encoder.layer.8.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.8.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.9.attention.self.query.weight', 'text_encoder.bert.encoder.layer.9.attention.self.query.bias', 'text_encoder.bert.encoder.layer.9.attention.self.key.weight', 'text_encoder.bert.encoder.layer.9.attention.self.key.bias', 'text_encoder.bert.encoder.layer.9.attention.self.value.weight', 'text_encoder.bert.encoder.layer.9.attention.self.value.bias', 'text_encoder.bert.encoder.layer.9.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.9.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.9.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.9.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.9.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.9.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.9.output.dense.weight', 'text_encoder.bert.encoder.layer.9.output.dense.bias', 'text_encoder.bert.encoder.layer.9.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.9.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.10.attention.self.query.weight', 'text_encoder.bert.encoder.layer.10.attention.self.query.bias', 'text_encoder.bert.encoder.layer.10.attention.self.key.weight', 'text_encoder.bert.encoder.layer.10.attention.self.key.bias', 'text_encoder.bert.encoder.layer.10.attention.self.value.weight', 'text_encoder.bert.encoder.layer.10.attention.self.value.bias', 'text_encoder.bert.encoder.layer.10.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.10.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.10.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.10.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.10.intermediate.dense.weight', 
'text_encoder.bert.encoder.layer.10.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.10.output.dense.weight', 'text_encoder.bert.encoder.layer.10.output.dense.bias', 'text_encoder.bert.encoder.layer.10.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.10.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.11.attention.self.query.weight', 'text_encoder.bert.encoder.layer.11.attention.self.query.bias', 'text_encoder.bert.encoder.layer.11.attention.self.key.weight', 'text_encoder.bert.encoder.layer.11.attention.self.key.bias', 'text_encoder.bert.encoder.layer.11.attention.self.value.weight', 'text_encoder.bert.encoder.layer.11.attention.self.value.bias', 'text_encoder.bert.encoder.layer.11.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.11.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.11.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.11.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.11.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.11.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.11.output.dense.weight', 'text_encoder.bert.encoder.layer.11.output.dense.bias', 'text_encoder.bert.encoder.layer.11.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.11.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.12.attention.self.query.weight', 'text_encoder.bert.encoder.layer.12.attention.self.query.bias', 'text_encoder.bert.encoder.layer.12.attention.self.key.weight', 'text_encoder.bert.encoder.layer.12.attention.self.key.bias', 'text_encoder.bert.encoder.layer.12.attention.self.value.weight', 'text_encoder.bert.encoder.layer.12.attention.self.value.bias', 'text_encoder.bert.encoder.layer.12.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.12.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.12.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.12.attention.output.LayerNorm.bias', 
'text_encoder.bert.encoder.layer.12.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.12.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.12.output.dense.weight', 'text_encoder.bert.encoder.layer.12.output.dense.bias', 'text_encoder.bert.encoder.layer.12.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.12.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.13.attention.self.query.weight', 'text_encoder.bert.encoder.layer.13.attention.self.query.bias', 'text_encoder.bert.encoder.layer.13.attention.self.key.weight', 'text_encoder.bert.encoder.layer.13.attention.self.key.bias', 'text_encoder.bert.encoder.layer.13.attention.self.value.weight', 'text_encoder.bert.encoder.layer.13.attention.self.value.bias', 'text_encoder.bert.encoder.layer.13.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.13.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.13.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.13.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.13.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.13.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.13.output.dense.weight', 'text_encoder.bert.encoder.layer.13.output.dense.bias', 'text_encoder.bert.encoder.layer.13.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.13.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.14.attention.self.query.weight', 'text_encoder.bert.encoder.layer.14.attention.self.query.bias', 'text_encoder.bert.encoder.layer.14.attention.self.key.weight', 'text_encoder.bert.encoder.layer.14.attention.self.key.bias', 'text_encoder.bert.encoder.layer.14.attention.self.value.weight', 'text_encoder.bert.encoder.layer.14.attention.self.value.bias', 'text_encoder.bert.encoder.layer.14.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.14.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.14.attention.output.LayerNorm.weight', 
'text_encoder.bert.encoder.layer.14.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.14.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.14.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.14.output.dense.weight', 'text_encoder.bert.encoder.layer.14.output.dense.bias', 'text_encoder.bert.encoder.layer.14.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.14.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.15.attention.self.query.weight', 'text_encoder.bert.encoder.layer.15.attention.self.query.bias', 'text_encoder.bert.encoder.layer.15.attention.self.key.weight', 'text_encoder.bert.encoder.layer.15.attention.self.key.bias', 'text_encoder.bert.encoder.layer.15.attention.self.value.weight', 'text_encoder.bert.encoder.layer.15.attention.self.value.bias', 'text_encoder.bert.encoder.layer.15.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.15.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.15.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.15.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.15.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.15.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.15.output.dense.weight', 'text_encoder.bert.encoder.layer.15.output.dense.bias', 'text_encoder.bert.encoder.layer.15.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.15.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.16.attention.self.query.weight', 'text_encoder.bert.encoder.layer.16.attention.self.query.bias', 'text_encoder.bert.encoder.layer.16.attention.self.key.weight', 'text_encoder.bert.encoder.layer.16.attention.self.key.bias', 'text_encoder.bert.encoder.layer.16.attention.self.value.weight', 'text_encoder.bert.encoder.layer.16.attention.self.value.bias', 'text_encoder.bert.encoder.layer.16.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.16.attention.output.dense.bias', 
'text_encoder.bert.encoder.layer.16.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.16.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.16.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.16.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.16.output.dense.weight', 'text_encoder.bert.encoder.layer.16.output.dense.bias', 'text_encoder.bert.encoder.layer.16.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.16.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.17.attention.self.query.weight', 'text_encoder.bert.encoder.layer.17.attention.self.query.bias', 'text_encoder.bert.encoder.layer.17.attention.self.key.weight', 'text_encoder.bert.encoder.layer.17.attention.self.key.bias', 'text_encoder.bert.encoder.layer.17.attention.self.value.weight', 'text_encoder.bert.encoder.layer.17.attention.self.value.bias', 'text_encoder.bert.encoder.layer.17.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.17.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.17.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.17.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.17.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.17.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.17.output.dense.weight', 'text_encoder.bert.encoder.layer.17.output.dense.bias', 'text_encoder.bert.encoder.layer.17.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.17.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.18.attention.self.query.weight', 'text_encoder.bert.encoder.layer.18.attention.self.query.bias', 'text_encoder.bert.encoder.layer.18.attention.self.key.weight', 'text_encoder.bert.encoder.layer.18.attention.self.key.bias', 'text_encoder.bert.encoder.layer.18.attention.self.value.weight', 'text_encoder.bert.encoder.layer.18.attention.self.value.bias', 'text_encoder.bert.encoder.layer.18.attention.output.dense.weight', 
'text_encoder.bert.encoder.layer.18.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.18.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.18.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.18.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.18.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.18.output.dense.weight', 'text_encoder.bert.encoder.layer.18.output.dense.bias', 'text_encoder.bert.encoder.layer.18.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.18.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.19.attention.self.query.weight', 'text_encoder.bert.encoder.layer.19.attention.self.query.bias', 'text_encoder.bert.encoder.layer.19.attention.self.key.weight', 'text_encoder.bert.encoder.layer.19.attention.self.key.bias', 'text_encoder.bert.encoder.layer.19.attention.self.value.weight', 'text_encoder.bert.encoder.layer.19.attention.self.value.bias', 'text_encoder.bert.encoder.layer.19.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.19.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.19.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.19.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.19.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.19.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.19.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.19.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.19.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.19.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.19.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.19.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.19.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.19.crossattention.output.LayerNorm.bias', 
'text_encoder.bert.encoder.layer.19.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.19.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.19.output.dense.weight', 'text_encoder.bert.encoder.layer.19.output.dense.bias', 'text_encoder.bert.encoder.layer.19.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.19.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.20.attention.self.query.weight', 'text_encoder.bert.encoder.layer.20.attention.self.query.bias', 'text_encoder.bert.encoder.layer.20.attention.self.key.weight', 'text_encoder.bert.encoder.layer.20.attention.self.key.bias', 'text_encoder.bert.encoder.layer.20.attention.self.value.weight', 'text_encoder.bert.encoder.layer.20.attention.self.value.bias', 'text_encoder.bert.encoder.layer.20.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.20.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.20.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.20.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.20.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.20.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.20.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.20.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.20.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.20.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.20.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.20.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.20.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.20.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.20.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.20.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.20.output.dense.weight', 'text_encoder.bert.encoder.layer.20.output.dense.bias', 
'text_encoder.bert.encoder.layer.20.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.20.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.21.attention.self.query.weight', 'text_encoder.bert.encoder.layer.21.attention.self.query.bias', 'text_encoder.bert.encoder.layer.21.attention.self.key.weight', 'text_encoder.bert.encoder.layer.21.attention.self.key.bias', 'text_encoder.bert.encoder.layer.21.attention.self.value.weight', 'text_encoder.bert.encoder.layer.21.attention.self.value.bias', 'text_encoder.bert.encoder.layer.21.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.21.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.21.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.21.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.21.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.21.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.21.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.21.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.21.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.21.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.21.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.21.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.21.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.21.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.21.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.21.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.21.output.dense.weight', 'text_encoder.bert.encoder.layer.21.output.dense.bias', 'text_encoder.bert.encoder.layer.21.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.21.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.22.attention.self.query.weight', 'text_encoder.bert.encoder.layer.22.attention.self.query.bias', 
'text_encoder.bert.encoder.layer.22.attention.self.key.weight', 'text_encoder.bert.encoder.layer.22.attention.self.key.bias', 'text_encoder.bert.encoder.layer.22.attention.self.value.weight', 'text_encoder.bert.encoder.layer.22.attention.self.value.bias', 'text_encoder.bert.encoder.layer.22.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.22.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.22.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.22.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.22.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.22.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.22.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.22.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.22.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.22.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.22.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.22.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.22.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.22.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.22.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.22.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.22.output.dense.weight', 'text_encoder.bert.encoder.layer.22.output.dense.bias', 'text_encoder.bert.encoder.layer.22.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.22.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.23.attention.self.query.weight', 'text_encoder.bert.encoder.layer.23.attention.self.query.bias', 'text_encoder.bert.encoder.layer.23.attention.self.key.weight', 'text_encoder.bert.encoder.layer.23.attention.self.key.bias', 'text_encoder.bert.encoder.layer.23.attention.self.value.weight', 'text_encoder.bert.encoder.layer.23.attention.self.value.bias', 
'text_encoder.bert.encoder.layer.23.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.23.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.23.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.23.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.23.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.23.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.23.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.23.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.23.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.23.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.23.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.23.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.23.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.23.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.23.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.23.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.23.output.dense.weight', 'text_encoder.bert.encoder.layer.23.output.dense.bias', 'text_encoder.bert.encoder.layer.23.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.23.output.LayerNorm.bias', 'text_encoder.cls.predictions.bias', 'text_encoder.cls.predictions.transform.dense.weight', 'text_encoder.cls.predictions.transform.dense.bias', 'text_encoder.cls.predictions.transform.LayerNorm.weight', 'text_encoder.cls.predictions.transform.LayerNorm.bias', 'text_encoder.cls.predictions.decoder.weight', 'text_encoder.cls.predictions.decoder.bias'])\n"
]
}
],
"source": [
"# (Re)build the InternVideo2 stage-2 model and its tokenizer.\n",
"#\n",
"# The cell is meant to be re-runnable: any model/tokenizer left over from a\n",
"# previous execution is unbound first so the old weights can be released\n",
"# before a fresh copy is loaded.\n",
"for _stale_name in ('intern_model', 'tokenizer'):\n",
"    # pop() tolerates a missing name, so one binding existing without the\n",
"    # other no longer raises NameError.\n",
"    globals().pop(_stale_name, None)\n",
"if torch.cuda.is_available():\n",
"    # Return cached, now-unused blocks to the GPU before loading again.\n",
"    torch.cuda.empty_cache()\n",
"\n",
"# Single source of truth for the locations and clip length used below.\n",
"INTERNVIDEO2_ROOT = '/home/toolkit/eai_urlb/InternVideo/InternVideo2'\n",
"MULTI_MODALITY_DIR = INTERNVIDEO2_ROOT + '/multi_modality/'\n",
"BERT_LARGE_SNAPSHOT = '/home/toolkit/.cache/huggingface/hub/models--bert-large-uncased/snapshots/6da4b6a26a1877e173fca3225479512db81a5e5b/'\n",
"# NOTE(review): the checkpoint file name ends in 'f4' while 8 frames are\n",
"# configured here - confirm this combination is intended.\n",
"NUM_FRAMES = 8\n",
"\n",
"config = Config.from_file(MULTI_MODALITY_DIR + 'demo/internvideo2_stage2_config.py')\n",
"config = eval_dict_leaf(config)\n",
"\n",
"# Keep the frame count consistent everywhere the config stores it.\n",
"config.model.vision_encoder.num_frames = NUM_FRAMES\n",
"config.num_frames = NUM_FRAMES\n",
"config.num_frames_test = NUM_FRAMES\n",
"\n",
"# Text encoder: local BERT snapshot, plus the encoder config path from the\n",
"# loaded config file prefixed with the multi_modality directory.\n",
"config.model.text_encoder.pretrained = BERT_LARGE_SNAPSHOT\n",
"config.model.text_encoder.config = MULTI_MODALITY_DIR + config.model.text_encoder.config\n",
"\n",
"# The same checkpoint path is written to both config slots.\n",
"model_pth = INTERNVIDEO2_ROOT + '/download_models/InternVideo2-stage2_1b-224p-f4.pt'\n",
"config.pretrained_path = model_pth\n",
"config['model']['vision_encoder']['pretrained'] = model_pth\n",
"\n",
"intern_model, tokenizer = setup_internvideo2(config)"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Video tensor([0.0030], device='cuda:0')\n",
"Text tensor([-0.0008, -0.0001, -0.0013, -0.0014, 0.0005, -0.0004, -0.0004, -0.0006,\n",
" 0.0001, -0.0003, 0.0003, 0.0012, -0.0004, 0.0007, -0.0014, -0.0017,\n",
" -0.0007, -0.0018, -0.0006, -0.0024], device='cuda:0')\n",
"text: Somebody walking ~ prob: 0.6945\n",
"text: Playing with hat ~ prob: 0.1198\n",
"text: A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride. ~ prob: 0.0297\n",
"text: A person with a hat ~ prob: 0.0245\n",
"text: A person dressed in a blue jacket shovels the snow-covered pavement outside their house. ~ prob: 0.0226\n",
"text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.0222\n",
"text: A group of friends playing bowling. ~ prob: 0.0212\n",
"text: A person lying in bed ~ prob: 0.0208\n",
"text: A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery. ~ prob: 0.0186\n",
"text: A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees. ~ prob: 0.0102\n",
"text: A person playing with a kid in the street ~ prob: 0.0045\n",
"text: A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys. ~ prob: 0.0025\n",
"text: A playful dog slides down a snowy hill, wagging its tail with delight. ~ prob: 0.0024\n",
"text: A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees. ~ prob: 0.0015\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"# Switch the model to evaluation mode, then score the candidate captions\n",
"# against the decoded video frames.\n",
"intern_model.eval()\n",
"texts, probs = retrieve_text(\n",
"    frames,\n",
"    text_candidates,\n",
"    model=intern_model,\n",
"    topk=14,\n",
"    config=config,\n",
")\n",
"\n",
"# Reference output, presumably pasted from an earlier run (its text tensor\n",
"# lists 19 values) and kept here for comparison:\n",
"# Video tensor([0.0023], device='cuda:0')\n",
"# Text tensor([-0.0008, -0.0001, -0.0013, -0.0014, 0.0005, -0.0004, -0.0004, -0.0006,\n",
"# 0.0001, -0.0003, 0.0003, 0.0012, -0.0004, 0.0007, -0.0014, -0.0017,\n",
"# -0.0007, -0.0018, -0.0006], device='cuda:0')\n",
"# text: A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery. ~ prob: 0.4592\n",
"# text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.1335\n",
"# text: A japanese girl eating noodles ~ prob: 0.1089\n",
"\n",
"# One line per returned caption, in the order retrieve_text returned them.\n",
"# Loop names are unchanged because they stay bound in the notebook namespace.\n",
"for t, p in zip(texts, probs):\n",
"    print(f'text: {t} ~ prob: {p:.4f}')"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Parameter containing:\n",
"tensor([[[[[-4.7913e-03, -2.1515e-03, -2.0447e-03, ..., 3.2997e-04,\n",
" -3.0212e-03, -7.9727e-04],\n",
" [ 9.7656e-04, 2.4567e-03, 9.8419e-04, ..., -1.8845e-03,\n",
" 2.3193e-03, 3.6621e-03],\n",
" [-3.5095e-04, 2.5940e-03, -2.7618e-03, ..., -3.7956e-04,\n",
" -3.1948e-05, 1.7166e-03],\n",
" ...,\n",
" [-3.8605e-03, -1.1215e-03, -9.0790e-04, ..., 6.5994e-04,\n",
" 1.0071e-03, 1.2894e-03],\n",
" [-2.2278e-03, 1.2589e-03, -1.0204e-04, ..., 3.7079e-03,\n",
" 1.5354e-04, -8.3160e-04],\n",
" [ 6.3324e-04, 1.4114e-03, 9.5367e-04, ..., -3.4485e-03,\n",
" -1.8234e-03, -4.0283e-03]]],\n",
"\n",
"\n",
" [[[ 7.0190e-04, -1.3657e-03, -6.5994e-04, ..., 1.4725e-03,\n",
" -8.5831e-04, 1.6212e-04],\n",
" [ 1.7262e-04, 8.0872e-04, 4.1485e-05, ..., -6.4850e-04,\n",
" 5.5695e-04, 1.7242e-03],\n",
" [ 1.3504e-03, 3.2959e-03, -1.3275e-03, ..., 2.2736e-03,\n",
" 4.2725e-04, 1.9150e-03],\n",
" ...,\n",
" [-2.3041e-03, -6.4850e-04, -2.8839e-03, ..., 2.9755e-04,\n",
" -3.0518e-04, 1.2817e-03],\n",
" [ 9.3079e-04, -1.2512e-03, -1.5335e-03, ..., 1.9455e-03,\n",
" -3.4142e-04, -1.2054e-03],\n",
" [ 9.1553e-03, 3.6774e-03, 2.2125e-03, ..., -5.3883e-05,\n",
" 3.2234e-04, 2.3499e-03]]],\n",
"\n",
"\n",
" [[[ 2.0752e-03, 7.4768e-04, 2.6512e-04, ..., 2.3193e-03,\n",
" -3.3379e-04, -9.2983e-05],\n",
" [ 1.4725e-03, 1.0986e-03, -8.8692e-05, ..., -2.8229e-04,\n",
" 7.2098e-04, -2.2888e-03],\n",
" [ 1.3809e-03, 1.5945e-03, 6.5231e-04, ..., 3.3112e-03,\n",
" 2.1515e-03, -1.4114e-03],\n",
" ...,\n",
" [-1.2512e-03, 1.0605e-03, 5.6744e-05, ..., -4.7112e-04,\n",
" -3.4714e-04, -1.6861e-03],\n",
" [-5.4550e-04, 1.1978e-03, 1.9531e-03, ..., 7.6675e-04,\n",
" -1.9150e-03, -1.6937e-03],\n",
" [-4.5776e-03, -3.0212e-03, -1.4648e-03, ..., -1.0757e-03,\n",
" 1.0061e-04, 2.9449e-03]]]],\n",
"\n",
"\n",
"\n",
" [[[[-9.9487e-03, -5.9814e-03, 3.9673e-03, ..., 7.8125e-03,\n",
" 4.5776e-03, -4.7607e-03],\n",
" [-7.5989e-03, 2.5940e-04, -6.0730e-03, ..., -1.4725e-03,\n",
" -3.8300e-03, -2.4567e-03],\n",
" [ 6.9427e-04, 3.1090e-04, -2.1515e-03, ..., -1.2779e-04,\n",
" -6.0120e-03, -1.4191e-03],\n",
" ...,\n",
" [ 1.1597e-02, -8.3447e-05, -1.3428e-03, ..., -4.4556e-03,\n",
" -4.4823e-04, -1.6861e-03],\n",
" [ 4.3640e-03, -2.0447e-03, -1.3123e-03, ..., -4.4556e-03,\n",
" -4.0283e-03, -4.6387e-03],\n",
" [ 9.2163e-03, -5.1880e-03, 1.3351e-03, ..., 4.7112e-04,\n",
" 2.6550e-03, 4.9744e-03]]],\n",
"\n",
"\n",
" [[[ 4.3030e-03, -6.3171e-03, -1.2436e-03, ..., 2.1210e-03,\n",
" -9.4250e-07, -1.0559e-02],\n",
" [-1.2436e-03, -4.1504e-03, -9.7046e-03, ..., -2.8687e-03,\n",
" -6.9885e-03, -9.7046e-03],\n",
" [ 6.2561e-04, -5.7678e-03, -3.9978e-03, ..., -1.9989e-03,\n",
" -4.5166e-03, -5.5542e-03],\n",
" ...,\n",
" [ 7.6904e-03, -2.9144e-03, -2.0905e-03, ..., -3.9368e-03,\n",
" 2.1515e-03, -3.9062e-03],\n",
" [-6.9427e-04, -2.9907e-03, -1.2512e-03, ..., 1.6785e-03,\n",
" 5.8594e-03, -2.0294e-03],\n",
" [ 7.8678e-05, -6.0730e-03, 1.0834e-03, ..., 2.9564e-04,\n",
" 3.1738e-03, -8.4839e-03]]],\n",
"\n",
"\n",
" [[[ 4.2419e-03, -7.5073e-03, -2.8381e-03, ..., -7.7515e-03,\n",
" -6.6223e-03, 2.1667e-03],\n",
" [ 8.1787e-03, 5.8899e-03, 1.0376e-03, ..., -1.8463e-03,\n",
" -3.1281e-03, 5.8899e-03],\n",
" [ 1.7776e-03, -4.2915e-04, 8.6975e-04, ..., -4.2915e-05,\n",
" -3.2043e-03, 9.5825e-03],\n",
" ...,\n",
" [ 2.7847e-04, -1.9989e-03, -5.2490e-03, ..., -5.7068e-03,\n",
" -4.5776e-04, -3.5095e-03],\n",
" [-5.3406e-03, -6.9427e-04, -4.9133e-03, ..., -1.0910e-03,\n",
" -6.4468e-04, -5.1880e-03],\n",
" [-7.3853e-03, -2.1210e-03, 4.7302e-03, ..., 2.0752e-03,\n",
" -2.0447e-03, -1.2329e-02]]]],\n",
"\n",
"\n",
"\n",
" [[[[ 4.4861e-03, -2.8992e-03, -4.7302e-03, ..., -5.3406e-03,\n",
" -4.6692e-03, -4.6387e-03],\n",
" [ 1.8921e-03, -5.6458e-03, -3.7079e-03, ..., -2.5482e-03,\n",
" -4.8218e-03, 2.1515e-03],\n",
" [ 4.5471e-03, 2.9755e-04, -3.7842e-03, ..., 3.6774e-03,\n",
" -2.6550e-03, -1.8845e-03],\n",
" ...,\n",
" [ 7.2098e-04, 3.1281e-03, 2.0027e-04, ..., 2.7924e-03,\n",
" 1.0986e-03, 3.4943e-03],\n",
" [ 1.4496e-03, -2.8229e-04, 7.0801e-03, ..., 1.0071e-03,\n",
" -3.9978e-03, 3.7689e-03],\n",
" [ 9.9945e-04, 7.3624e-04, 9.7046e-03, ..., 3.9673e-03,\n",
" 6.7139e-03, 1.1414e-02]]],\n",
"\n",
"\n",
" [[[-3.5400e-03, -6.9809e-04, 3.9673e-03, ..., 7.1716e-04,\n",
" 2.3651e-03, 1.6098e-03],\n",
" [-1.7319e-03, 8.0109e-04, 2.7466e-03, ..., -1.7262e-04,\n",
" -1.6937e-03, 6.1340e-03],\n",
" [-3.9978e-03, 2.0599e-03, -2.4414e-03, ..., 2.2888e-03,\n",
" 2.2736e-03, 4.1809e-03],\n",
" ...,\n",
" [-6.6223e-03, -1.0529e-03, -3.0823e-03, ..., 1.2894e-03,\n",
" 1.7624e-03, -6.0425e-03],\n",
" [ 7.5531e-04, -2.0599e-03, 2.0142e-03, ..., 3.3569e-03,\n",
" 1.8215e-04, -7.1411e-03],\n",
" [ 5.5237e-03, 2.3842e-04, 7.2937e-03, ..., -4.1809e-03,\n",
" -4.4861e-03, -1.7700e-02]]],\n",
"\n",
"\n",
" [[[-7.1716e-04, -1.5488e-03, -2.5635e-03, ..., 1.8692e-03,\n",
" 5.4016e-03, 3.8300e-03],\n",
" [ 1.9531e-03, -9.1934e-04, 2.0981e-05, ..., -5.3024e-04,\n",
" -1.9989e-03, 1.1778e-04],\n",
" [ 2.5635e-03, 4.4556e-03, -3.9978e-03, ..., 1.7548e-03,\n",
" 1.4114e-04, -1.2817e-03],\n",
" ...,\n",
" [-1.9226e-03, -4.1389e-04, -4.2114e-03, ..., 8.2016e-04,\n",
" 5.0964e-03, 2.5330e-03],\n",
" [ 8.4229e-03, 1.8539e-03, 1.4038e-03, ..., 2.4109e-03,\n",
" 1.8616e-03, -1.0300e-03],\n",
" [-8.6784e-05, -7.3547e-03, -1.5182e-03, ..., 1.5335e-03,\n",
" 2.2736e-03, -8.4839e-03]]]],\n",
"\n",
"\n",
"\n",
" ...,\n",
"\n",
"\n",
"\n",
" [[[[-1.2207e-02, 4.0771e-02, -2.9419e-02, ..., 6.5918e-02,\n",
" -2.6978e-02, 3.0640e-02],\n",
" [ 7.8125e-02, 4.8828e-02, -5.3955e-02, ..., 4.3945e-02,\n",
" -3.3447e-02, 3.4424e-02],\n",
" [-1.3489e-02, 7.2021e-03, -5.0293e-02, ..., -1.9043e-02,\n",
" -4.4189e-02, -3.7354e-02],\n",
" ...,\n",
" [-3.1250e-02, -1.1047e-02, 5.7617e-02, ..., -1.9287e-02,\n",
" 6.2500e-02, -6.5308e-03],\n",
" [-3.1738e-02, 5.6152e-03, -1.0986e-02, ..., -5.6763e-03,\n",
" 2.3804e-02, 6.2500e-02],\n",
" [-3.5400e-02, -4.4861e-03, 3.7109e-02, ..., 3.3691e-02,\n",
" -7.1777e-02, 9.3750e-02]]],\n",
"\n",
"\n",
" [[[-1.5332e-01, 3.3203e-02, -7.8125e-02, ..., 6.9824e-02,\n",
" -1.1902e-02, 6.1340e-03],\n",
" [ 5.1117e-04, 2.8809e-02, -7.9102e-02, ..., 5.4932e-02,\n",
" -8.6670e-03, 2.2827e-02],\n",
" [-7.3853e-03, 6.6528e-03, -4.9561e-02, ..., 5.6076e-04,\n",
" -3.0029e-02, -1.6724e-02],\n",
" ...,\n",
" [ 6.7871e-02, 5.0049e-02, 7.0801e-02, ..., -6.1646e-03,\n",
" 7.9102e-02, -4.5410e-02],\n",
" [ 3.5706e-03, 6.5308e-03, -9.0942e-03, ..., -2.1210e-03,\n",
" 4.9561e-02, 4.6143e-02],\n",
" [ 5.6152e-02, 1.8311e-02, 4.5898e-02, ..., 2.8564e-02,\n",
" -9.3262e-02, -4.9316e-02]]],\n",
"\n",
"\n",
" [[[-5.1270e-02, 9.4727e-02, 2.1973e-02, ..., 4.9072e-02,\n",
" -6.0547e-02, 6.9580e-03],\n",
" [-1.7578e-02, 1.5869e-02, -5.0293e-02, ..., 9.4604e-03,\n",
" -6.9336e-02, 7.2098e-04],\n",
" [-6.4453e-02, -1.0620e-02, -7.0801e-02, ..., 3.5156e-02,\n",
" -4.1016e-02, -3.4912e-02],\n",
" ...,\n",
" [-5.2185e-03, 3.0640e-02, 7.5195e-02, ..., 4.1260e-02,\n",
" 8.6426e-02, -2.6367e-02],\n",
" [ 2.8076e-03, 5.4626e-03, 2.0874e-02, ..., 1.0452e-03,\n",
" -1.2207e-02, 2.1973e-03],\n",
" [-7.7148e-02, -5.0781e-02, 3.3936e-02, ..., 1.7334e-02,\n",
" -1.2988e-01, -5.0781e-02]]]],\n",
"\n",
"\n",
"\n",
" [[[[-5.6885e-02, -1.1035e-01, -2.1118e-02, ..., 5.2002e-02,\n",
" -7.9346e-03, -5.3711e-02],\n",
" [ 2.9053e-02, 1.7944e-02, -1.0315e-02, ..., 3.6621e-02,\n",
" 3.3936e-02, -1.4587e-02],\n",
" [ 1.5259e-04, -2.6245e-02, -9.5703e-02, ..., 9.5825e-03,\n",
" 6.3965e-02, 2.9907e-02],\n",
" ...,\n",
" [ 3.8818e-02, 2.9907e-02, 2.7710e-02, ..., -1.0938e-01,\n",
" -4.6387e-02, 5.1575e-03],\n",
" [-1.7212e-02, 1.6235e-02, -6.0547e-02, ..., -2.7710e-02,\n",
" -5.9204e-03, 2.5024e-02],\n",
" [-4.1504e-02, 1.3794e-02, -8.2520e-02, ..., 4.7852e-02,\n",
" 8.2520e-02, -9.4238e-02]]],\n",
"\n",
"\n",
" [[[ 3.7842e-02, -9.7656e-02, -4.6143e-02, ..., 2.5635e-02,\n",
" -1.6479e-02, 1.9531e-03],\n",
" [ 7.7148e-02, 4.0771e-02, -3.4027e-03, ..., 1.4648e-02,\n",
" -2.0142e-02, -8.6670e-03],\n",
" [ 6.1035e-02, 6.5308e-03, -2.5635e-02, ..., 5.0049e-02,\n",
" 4.8828e-03, -2.4780e-02],\n",
" ...,\n",
" [ 9.9487e-03, -6.6528e-03, -1.1353e-02, ..., -1.1572e-01,\n",
" 1.9043e-02, 5.9082e-02],\n",
" [ 3.8086e-02, 4.2725e-02, -4.7363e-02, ..., 7.5989e-03,\n",
" 6.4087e-03, 7.3853e-03],\n",
" [-7.2632e-03, 9.0820e-02, -5.2979e-02, ..., 4.4678e-02,\n",
" 6.6895e-02, -1.0303e-01]]],\n",
"\n",
"\n",
" [[[ 1.4526e-02, -1.1475e-01, -3.8818e-02, ..., 8.1055e-02,\n",
" 3.8818e-02, 2.1118e-02],\n",
" [ 1.2354e-01, 5.1025e-02, 2.1973e-03, ..., 1.8677e-02,\n",
" 1.5991e-02, -3.3203e-02],\n",
" [ 7.7148e-02, 6.9336e-02, -3.7842e-02, ..., -3.0151e-02,\n",
" 3.9062e-02, -2.2217e-02],\n",
" ...,\n",
" [ 4.3701e-02, 8.2520e-02, 1.0156e-01, ..., -8.5449e-02,\n",
" -3.0060e-03, 1.0547e-01],\n",
" [ 3.0640e-02, 8.0078e-02, 2.1606e-02, ..., -2.0264e-02,\n",
" 1.9287e-02, 7.8613e-02],\n",
" [ 4.4678e-02, 9.7168e-02, -4.9561e-02, ..., 3.6377e-02,\n",
" 1.3477e-01, -2.7771e-03]]]],\n",
"\n",
"\n",
"\n",
" [[[[-2.0996e-01, -4.6387e-02, -5.6458e-03, ..., 1.7334e-02,\n",
" 4.6082e-03, -1.4038e-02],\n",
" [-2.8931e-02, 2.0020e-02, -8.7891e-03, ..., -8.2520e-02,\n",
" -5.2002e-02, -1.5869e-03],\n",
" [ 7.1289e-02, 4.3335e-03, 1.1047e-02, ..., -7.5684e-03,\n",
" -1.7456e-02, 1.5137e-02],\n",
" ...,\n",
" [ 5.0781e-02, 4.3213e-02, -8.1055e-02, ..., -3.9062e-02,\n",
" -1.0693e-01, -2.9175e-02],\n",
" [ 6.2500e-02, -3.9062e-03, -7.8735e-03, ..., -3.6377e-02,\n",
" -4.8340e-02, 4.8340e-02],\n",
" [-8.4473e-02, -3.3447e-02, -6.7383e-02, ..., 4.0527e-02,\n",
" -6.9885e-03, 1.0547e-01]]],\n",
"\n",
"\n",
" [[[-2.2266e-01, -4.6143e-02, -2.9541e-02, ..., 6.8054e-03,\n",
" 3.4180e-02, -2.3682e-02],\n",
" [-2.4170e-02, 5.7129e-02, 4.0771e-02, ..., -4.5898e-02,\n",
" -1.1536e-02, 9.0942e-03],\n",
" [ 7.8613e-02, 4.6631e-02, 9.9182e-04, ..., 4.2236e-02,\n",
" 2.5879e-02, 4.2236e-02],\n",
" ...,\n",
" [ 5.7129e-02, 3.1250e-02, -7.7148e-02, ..., -2.1362e-02,\n",
" -2.8809e-02, -6.3171e-03],\n",
" [ 1.0352e-01, 6.2988e-02, -1.6602e-02, ..., -3.4668e-02,\n",
" -5.9128e-04, 6.2988e-02],\n",
" [-1.0193e-02, 5.0537e-02, -1.6479e-02, ..., 3.1738e-02,\n",
" -4.3945e-02, -2.5146e-02]]],\n",
"\n",
"\n",
" [[[ 5.5176e-02, -2.3804e-02, -2.0020e-02, ..., -5.1270e-03,\n",
" -5.8899e-03, -2.4414e-02],\n",
" [ 5.4199e-02, 4.0894e-03, 8.1787e-03, ..., -1.5320e-02,\n",
" 1.3885e-03, 3.7842e-02],\n",
" [ 1.0938e-01, 5.5847e-03, -1.3184e-02, ..., 4.1260e-02,\n",
" 2.1484e-02, 5.2734e-02],\n",
" ...,\n",
" [-4.3297e-04, -1.1169e-02, -8.5449e-02, ..., 2.0264e-02,\n",
" -4.7363e-02, -3.6774e-03],\n",
" [ 5.5847e-03, -1.7212e-02, -4.9805e-02, ..., -7.6660e-02,\n",
" -2.7466e-02, 4.8340e-02],\n",
" [-9.3750e-02, -3.6377e-02, -6.6895e-02, ..., -3.0029e-02,\n",
" -5.8350e-02, -6.1768e-02]]]]], device='cuda:0')"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Compare the patch-embedding weight with the snapshot that used to sit in this cell as a\n",
"# pasted printout, instead of comparing the two by eye. The printout was abbreviated by\n",
"# torch, so it only holds the corner entries (first/last three rows and columns) of the\n",
"# first 2-D slice, i.e. index [0, 0, 0].\n",
"# NOTE(review): the paste carried no label; it is assumed to come from this same\n",
"# parameter -- confirm.\n",
"patch_embed_reference = torch.tensor([\n",
"    [-4.7913e-03, -2.1515e-03, -2.0447e-03,  3.2997e-04, -3.0212e-03, -7.9727e-04],\n",
"    [ 9.7656e-04,  2.4567e-03,  9.8419e-04, -1.8845e-03,  2.3193e-03,  3.6621e-03],\n",
"    [-3.5095e-04,  2.5940e-03, -2.7618e-03, -3.7956e-04, -3.1948e-05,  1.7166e-03],\n",
"    [-3.8605e-03, -1.1215e-03, -9.0790e-04,  6.5994e-04,  1.0071e-03,  1.2894e-03],\n",
"    [-2.2278e-03,  1.2589e-03, -1.0204e-04,  3.7079e-03,  1.5354e-04, -8.3160e-04],\n",
"    [ 6.3324e-04,  1.4114e-03,  9.5367e-04, -3.4485e-03, -1.8234e-03, -4.0283e-03],\n",
"])\n",
"\n",
"patch_embed_weight = intern_model.vision_encoder.patch_embed.proj.weight\n",
"edge = [0, 1, 2, -3, -2, -1]  # rows/columns that survive in an abbreviated torch repr\n",
"patch_embed_corner = patch_embed_weight[0, 0, 0][edge][:, edge].detach().float().cpu()\n",
"\n",
"# The printout keeps five significant digits, so a relative tolerance is used.\n",
"print('matches reference snapshot:', torch.allclose(patch_embed_corner, patch_embed_reference, rtol=1e-3, atol=1e-7))\n",
"\n",
"# The parameter itself stays the cell's displayed value, as before.\n",
"patch_embed_weight\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Parameter containing:\n",
"tensor([[ 0.0366, 0.0135, 0.0492, ..., -0.0274, 0.0493, 0.0242],\n",
" [-0.0113, 0.0504, 0.0469, ..., -0.0269, -0.0224, -0.0305],\n",
" [ 0.0192, -0.0152, 0.0119, ..., 0.0115, -0.0182, -0.0063],\n",
" ...,\n",
" [-0.0370, -0.0460, 0.0203, ..., 0.0157, -0.0529, 0.0139],\n",
" [-0.0523, -0.0192, -0.0612, ..., -0.0515, 0.0169, 0.0098],\n",
" [ 0.0277, -0.0029, -0.0349, ..., 0.0014, -0.0453, 0.0052]],\n",
" device='cuda:0')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Compare the first text-encoder layer's output projection weight with the snapshot that\n",
"# used to sit in this cell as a pasted printout, instead of comparing the two by eye.\n",
"# The printout was abbreviated by torch, so it only holds the corner entries\n",
"# (first/last three rows and columns) of the matrix.\n",
"text_dense_reference = torch.tensor([\n",
"    [ 0.0366,  0.0135,  0.0492, -0.0274,  0.0493,  0.0242],\n",
"    [-0.0113,  0.0504,  0.0469, -0.0269, -0.0224, -0.0305],\n",
"    [ 0.0192, -0.0152,  0.0119,  0.0115, -0.0182, -0.0063],\n",
"    [-0.0370, -0.0460,  0.0203,  0.0157, -0.0529,  0.0139],\n",
"    [-0.0523, -0.0192, -0.0612, -0.0515,  0.0169,  0.0098],\n",
"    [ 0.0277, -0.0029, -0.0349,  0.0014, -0.0453,  0.0052],\n",
"])\n",
"\n",
"text_dense_weight = intern_model.text_encoder.encoder.layer[0].output.dense.weight\n",
"edge = [0, 1, 2, -3, -2, -1]  # rows/columns that survive in an abbreviated torch repr\n",
"text_dense_corner = text_dense_weight[edge][:, edge].detach().float().cpu()\n",
"\n",
"# The printout is rounded to four decimals, so an absolute tolerance of 1e-4 is used.\n",
"print('matches reference snapshot:', torch.allclose(text_dense_corner, text_dense_reference, rtol=0.0, atol=1e-4))\n",
"\n",
"# The parameter itself stays the cell's displayed value, as before.\n",
"text_dense_weight"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
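"# Recorded output of one run, pasted by hand for reference; nothing here is regenerated\n",
"# when the cell executes. The next cell records another run with a different top-ranked caption.\n",
"# Video tensor([0.0005], device='cuda:0')\n",
"# Text tensor([-1.1985e-03, 5.7084e-04, -7.3242e-05, -2.1923e-04, 1.3280e-03,\n",
"# 6.7617e-05, -5.6482e-04, 1.3007e-03, 9.1326e-04, 5.7684e-04],\n",
"# device='cuda:0')\n",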
"# text: A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees. ~ prob: 0.5572\n",
"# text: A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys. ~ prob: 0.1044\n",
"# text: A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon. ~ prob: 0.0958\n",
"# text: A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride. ~ prob: 0.0936\n",
"# text: A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run. ~ prob: 0.0404"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
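"# Recorded output of another run (including the warning it emitted), pasted by hand for\n",
"# reference. Its top-ranked caption differs from the run recorded in the previous cell.\n",
"# /home/toolkit/.conda/envs/urlb_test/lib/python3.8/site-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",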
"# warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n",
"# Video tensor([-0.0014], device='cuda:0')\n",
"# Text tensor([-1.8553e-03, -1.8098e-03, 5.9901e-04, -1.9457e-03, 4.7702e-05,\n",
"# -2.8283e-03, -2.2676e-03, 7.7966e-04, -2.1556e-04, -3.8074e-04],\n",
"# device='cuda:0')\n",
"# text: A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride. ~ prob: 0.3186\n",
"# text: A playful dog slides down a snowy hill, wagging its tail with delight. ~ prob: 0.1871\n",
"# text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.1405\n",
"# text: A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys. ~ prob: 0.1344\n",
"# text: A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run. ~ prob: 0.0955"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.18"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
|