File size: 83,949 Bytes
2d9a728
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/toolkit/.conda/envs/urlb_test/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "DropoutAddRMSNorm of flash_attn is not installed!!!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2024-04-16 22:03:29,983] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import os\n",
    "import io\n",
    "import cv2\n",
    "\n",
    "import torch\n",
    "import sys\n",
    "sys.path.insert(0, '/home/toolkit/eai_urlb/InternVideo/InternVideo2/multi_modality/demo/')\n",
    "sys.path.insert(0, '/home/toolkit/eai_urlb/InternVideo/InternVideo2/multi_modality')\n",
    "\n",
    "from small_config import (Config, eval_dict_leaf)\n",
    "from small_utils import (retrieve_text, _frame_from_video, setup_internvideo2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# video = cv2.VideoCapture('example1.mp4')\n",
    "video = cv2.VideoCapture('../../../../video_samples/person_walking_video.mp4')\n",
    "frames = [x for x in _frame_from_video(video)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_candidates = [\"A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.\",\n",
    "                   \"A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.\",\n",
    "                   \"A person dressed in a blue jacket shovels the snow-covered pavement outside their house.\",\n",
    "                   \"A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.\",\n",
    "                   \"A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride.\",\n",
    "                   \"A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees.\",\n",
    "                   \"A playful dog slides down a snowy hill, wagging its tail with delight.\",\n",
    "                   \"A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.\",\n",
    "                   \"A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.\",\n",
    "                   \"A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery.\",\n",
    "                   \"A person playing with a kid in the street\",\n",
    "                   \"A group of friends playing bowling.\",\n",
    "                   \"A japanese girl eating noodles\",\n",
    "                   \"A painting by Monet\",\n",
    "                   \"A person lying in bed\",\n",
    "                   \"A person lying down on the grass\",\n",
    "                   \"A person with a hat\",\n",
    "                   \"Playing with hat\",\n",
    "                   \"Somebody walking\",\n",
    "                   \"Fidget spinner\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "load_state_dict: _IncompatibleKeys(missing_keys=['text_encoder.embeddings.position_ids', 'text_encoder.embeddings.word_embeddings.weight', 'text_encoder.embeddings.position_embeddings.weight', 'text_encoder.embeddings.token_type_embeddings.weight', 'text_encoder.embeddings.LayerNorm.weight', 'text_encoder.embeddings.LayerNorm.bias', 'text_encoder.encoder.layer.0.attention.self.query.weight', 'text_encoder.encoder.layer.0.attention.self.query.bias', 'text_encoder.encoder.layer.0.attention.self.key.weight', 'text_encoder.encoder.layer.0.attention.self.key.bias', 'text_encoder.encoder.layer.0.attention.self.value.weight', 'text_encoder.encoder.layer.0.attention.self.value.bias', 'text_encoder.encoder.layer.0.attention.output.dense.weight', 'text_encoder.encoder.layer.0.attention.output.dense.bias', 'text_encoder.encoder.layer.0.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.0.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.0.intermediate.dense.weight', 'text_encoder.encoder.layer.0.intermediate.dense.bias', 'text_encoder.encoder.layer.0.output.dense.weight', 'text_encoder.encoder.layer.0.output.dense.bias', 'text_encoder.encoder.layer.0.output.LayerNorm.weight', 'text_encoder.encoder.layer.0.output.LayerNorm.bias', 'text_encoder.encoder.layer.1.attention.self.query.weight', 'text_encoder.encoder.layer.1.attention.self.query.bias', 'text_encoder.encoder.layer.1.attention.self.key.weight', 'text_encoder.encoder.layer.1.attention.self.key.bias', 'text_encoder.encoder.layer.1.attention.self.value.weight', 'text_encoder.encoder.layer.1.attention.self.value.bias', 'text_encoder.encoder.layer.1.attention.output.dense.weight', 'text_encoder.encoder.layer.1.attention.output.dense.bias', 'text_encoder.encoder.layer.1.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.1.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.1.intermediate.dense.weight', 'text_encoder.encoder.layer.1.intermediate.dense.bias', 
'text_encoder.encoder.layer.1.output.dense.weight', 'text_encoder.encoder.layer.1.output.dense.bias', 'text_encoder.encoder.layer.1.output.LayerNorm.weight', 'text_encoder.encoder.layer.1.output.LayerNorm.bias', 'text_encoder.encoder.layer.2.attention.self.query.weight', 'text_encoder.encoder.layer.2.attention.self.query.bias', 'text_encoder.encoder.layer.2.attention.self.key.weight', 'text_encoder.encoder.layer.2.attention.self.key.bias', 'text_encoder.encoder.layer.2.attention.self.value.weight', 'text_encoder.encoder.layer.2.attention.self.value.bias', 'text_encoder.encoder.layer.2.attention.output.dense.weight', 'text_encoder.encoder.layer.2.attention.output.dense.bias', 'text_encoder.encoder.layer.2.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.2.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.2.intermediate.dense.weight', 'text_encoder.encoder.layer.2.intermediate.dense.bias', 'text_encoder.encoder.layer.2.output.dense.weight', 'text_encoder.encoder.layer.2.output.dense.bias', 'text_encoder.encoder.layer.2.output.LayerNorm.weight', 'text_encoder.encoder.layer.2.output.LayerNorm.bias', 'text_encoder.encoder.layer.3.attention.self.query.weight', 'text_encoder.encoder.layer.3.attention.self.query.bias', 'text_encoder.encoder.layer.3.attention.self.key.weight', 'text_encoder.encoder.layer.3.attention.self.key.bias', 'text_encoder.encoder.layer.3.attention.self.value.weight', 'text_encoder.encoder.layer.3.attention.self.value.bias', 'text_encoder.encoder.layer.3.attention.output.dense.weight', 'text_encoder.encoder.layer.3.attention.output.dense.bias', 'text_encoder.encoder.layer.3.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.3.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.3.intermediate.dense.weight', 'text_encoder.encoder.layer.3.intermediate.dense.bias', 'text_encoder.encoder.layer.3.output.dense.weight', 'text_encoder.encoder.layer.3.output.dense.bias', 
'text_encoder.encoder.layer.3.output.LayerNorm.weight', 'text_encoder.encoder.layer.3.output.LayerNorm.bias', 'text_encoder.encoder.layer.4.attention.self.query.weight', 'text_encoder.encoder.layer.4.attention.self.query.bias', 'text_encoder.encoder.layer.4.attention.self.key.weight', 'text_encoder.encoder.layer.4.attention.self.key.bias', 'text_encoder.encoder.layer.4.attention.self.value.weight', 'text_encoder.encoder.layer.4.attention.self.value.bias', 'text_encoder.encoder.layer.4.attention.output.dense.weight', 'text_encoder.encoder.layer.4.attention.output.dense.bias', 'text_encoder.encoder.layer.4.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.4.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.4.intermediate.dense.weight', 'text_encoder.encoder.layer.4.intermediate.dense.bias', 'text_encoder.encoder.layer.4.output.dense.weight', 'text_encoder.encoder.layer.4.output.dense.bias', 'text_encoder.encoder.layer.4.output.LayerNorm.weight', 'text_encoder.encoder.layer.4.output.LayerNorm.bias', 'text_encoder.encoder.layer.5.attention.self.query.weight', 'text_encoder.encoder.layer.5.attention.self.query.bias', 'text_encoder.encoder.layer.5.attention.self.key.weight', 'text_encoder.encoder.layer.5.attention.self.key.bias', 'text_encoder.encoder.layer.5.attention.self.value.weight', 'text_encoder.encoder.layer.5.attention.self.value.bias', 'text_encoder.encoder.layer.5.attention.output.dense.weight', 'text_encoder.encoder.layer.5.attention.output.dense.bias', 'text_encoder.encoder.layer.5.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.5.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.5.intermediate.dense.weight', 'text_encoder.encoder.layer.5.intermediate.dense.bias', 'text_encoder.encoder.layer.5.output.dense.weight', 'text_encoder.encoder.layer.5.output.dense.bias', 'text_encoder.encoder.layer.5.output.LayerNorm.weight', 'text_encoder.encoder.layer.5.output.LayerNorm.bias', 
'text_encoder.encoder.layer.6.attention.self.query.weight', 'text_encoder.encoder.layer.6.attention.self.query.bias', 'text_encoder.encoder.layer.6.attention.self.key.weight', 'text_encoder.encoder.layer.6.attention.self.key.bias', 'text_encoder.encoder.layer.6.attention.self.value.weight', 'text_encoder.encoder.layer.6.attention.self.value.bias', 'text_encoder.encoder.layer.6.attention.output.dense.weight', 'text_encoder.encoder.layer.6.attention.output.dense.bias', 'text_encoder.encoder.layer.6.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.6.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.6.intermediate.dense.weight', 'text_encoder.encoder.layer.6.intermediate.dense.bias', 'text_encoder.encoder.layer.6.output.dense.weight', 'text_encoder.encoder.layer.6.output.dense.bias', 'text_encoder.encoder.layer.6.output.LayerNorm.weight', 'text_encoder.encoder.layer.6.output.LayerNorm.bias', 'text_encoder.encoder.layer.7.attention.self.query.weight', 'text_encoder.encoder.layer.7.attention.self.query.bias', 'text_encoder.encoder.layer.7.attention.self.key.weight', 'text_encoder.encoder.layer.7.attention.self.key.bias', 'text_encoder.encoder.layer.7.attention.self.value.weight', 'text_encoder.encoder.layer.7.attention.self.value.bias', 'text_encoder.encoder.layer.7.attention.output.dense.weight', 'text_encoder.encoder.layer.7.attention.output.dense.bias', 'text_encoder.encoder.layer.7.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.7.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.7.intermediate.dense.weight', 'text_encoder.encoder.layer.7.intermediate.dense.bias', 'text_encoder.encoder.layer.7.output.dense.weight', 'text_encoder.encoder.layer.7.output.dense.bias', 'text_encoder.encoder.layer.7.output.LayerNorm.weight', 'text_encoder.encoder.layer.7.output.LayerNorm.bias', 'text_encoder.encoder.layer.8.attention.self.query.weight', 'text_encoder.encoder.layer.8.attention.self.query.bias', 
'text_encoder.encoder.layer.8.attention.self.key.weight', 'text_encoder.encoder.layer.8.attention.self.key.bias', 'text_encoder.encoder.layer.8.attention.self.value.weight', 'text_encoder.encoder.layer.8.attention.self.value.bias', 'text_encoder.encoder.layer.8.attention.output.dense.weight', 'text_encoder.encoder.layer.8.attention.output.dense.bias', 'text_encoder.encoder.layer.8.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.8.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.8.intermediate.dense.weight', 'text_encoder.encoder.layer.8.intermediate.dense.bias', 'text_encoder.encoder.layer.8.output.dense.weight', 'text_encoder.encoder.layer.8.output.dense.bias', 'text_encoder.encoder.layer.8.output.LayerNorm.weight', 'text_encoder.encoder.layer.8.output.LayerNorm.bias', 'text_encoder.encoder.layer.9.attention.self.query.weight', 'text_encoder.encoder.layer.9.attention.self.query.bias', 'text_encoder.encoder.layer.9.attention.self.key.weight', 'text_encoder.encoder.layer.9.attention.self.key.bias', 'text_encoder.encoder.layer.9.attention.self.value.weight', 'text_encoder.encoder.layer.9.attention.self.value.bias', 'text_encoder.encoder.layer.9.attention.output.dense.weight', 'text_encoder.encoder.layer.9.attention.output.dense.bias', 'text_encoder.encoder.layer.9.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.9.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.9.intermediate.dense.weight', 'text_encoder.encoder.layer.9.intermediate.dense.bias', 'text_encoder.encoder.layer.9.output.dense.weight', 'text_encoder.encoder.layer.9.output.dense.bias', 'text_encoder.encoder.layer.9.output.LayerNorm.weight', 'text_encoder.encoder.layer.9.output.LayerNorm.bias', 'text_encoder.encoder.layer.10.attention.self.query.weight', 'text_encoder.encoder.layer.10.attention.self.query.bias', 'text_encoder.encoder.layer.10.attention.self.key.weight', 'text_encoder.encoder.layer.10.attention.self.key.bias', 
'text_encoder.encoder.layer.10.attention.self.value.weight', 'text_encoder.encoder.layer.10.attention.self.value.bias', 'text_encoder.encoder.layer.10.attention.output.dense.weight', 'text_encoder.encoder.layer.10.attention.output.dense.bias', 'text_encoder.encoder.layer.10.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.10.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.10.intermediate.dense.weight', 'text_encoder.encoder.layer.10.intermediate.dense.bias', 'text_encoder.encoder.layer.10.output.dense.weight', 'text_encoder.encoder.layer.10.output.dense.bias', 'text_encoder.encoder.layer.10.output.LayerNorm.weight', 'text_encoder.encoder.layer.10.output.LayerNorm.bias', 'text_encoder.encoder.layer.11.attention.self.query.weight', 'text_encoder.encoder.layer.11.attention.self.query.bias', 'text_encoder.encoder.layer.11.attention.self.key.weight', 'text_encoder.encoder.layer.11.attention.self.key.bias', 'text_encoder.encoder.layer.11.attention.self.value.weight', 'text_encoder.encoder.layer.11.attention.self.value.bias', 'text_encoder.encoder.layer.11.attention.output.dense.weight', 'text_encoder.encoder.layer.11.attention.output.dense.bias', 'text_encoder.encoder.layer.11.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.11.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.11.intermediate.dense.weight', 'text_encoder.encoder.layer.11.intermediate.dense.bias', 'text_encoder.encoder.layer.11.output.dense.weight', 'text_encoder.encoder.layer.11.output.dense.bias', 'text_encoder.encoder.layer.11.output.LayerNorm.weight', 'text_encoder.encoder.layer.11.output.LayerNorm.bias', 'text_encoder.encoder.layer.12.attention.self.query.weight', 'text_encoder.encoder.layer.12.attention.self.query.bias', 'text_encoder.encoder.layer.12.attention.self.key.weight', 'text_encoder.encoder.layer.12.attention.self.key.bias', 'text_encoder.encoder.layer.12.attention.self.value.weight', 
'text_encoder.encoder.layer.12.attention.self.value.bias', 'text_encoder.encoder.layer.12.attention.output.dense.weight', 'text_encoder.encoder.layer.12.attention.output.dense.bias', 'text_encoder.encoder.layer.12.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.12.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.12.intermediate.dense.weight', 'text_encoder.encoder.layer.12.intermediate.dense.bias', 'text_encoder.encoder.layer.12.output.dense.weight', 'text_encoder.encoder.layer.12.output.dense.bias', 'text_encoder.encoder.layer.12.output.LayerNorm.weight', 'text_encoder.encoder.layer.12.output.LayerNorm.bias', 'text_encoder.encoder.layer.13.attention.self.query.weight', 'text_encoder.encoder.layer.13.attention.self.query.bias', 'text_encoder.encoder.layer.13.attention.self.key.weight', 'text_encoder.encoder.layer.13.attention.self.key.bias', 'text_encoder.encoder.layer.13.attention.self.value.weight', 'text_encoder.encoder.layer.13.attention.self.value.bias', 'text_encoder.encoder.layer.13.attention.output.dense.weight', 'text_encoder.encoder.layer.13.attention.output.dense.bias', 'text_encoder.encoder.layer.13.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.13.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.13.intermediate.dense.weight', 'text_encoder.encoder.layer.13.intermediate.dense.bias', 'text_encoder.encoder.layer.13.output.dense.weight', 'text_encoder.encoder.layer.13.output.dense.bias', 'text_encoder.encoder.layer.13.output.LayerNorm.weight', 'text_encoder.encoder.layer.13.output.LayerNorm.bias', 'text_encoder.encoder.layer.14.attention.self.query.weight', 'text_encoder.encoder.layer.14.attention.self.query.bias', 'text_encoder.encoder.layer.14.attention.self.key.weight', 'text_encoder.encoder.layer.14.attention.self.key.bias', 'text_encoder.encoder.layer.14.attention.self.value.weight', 'text_encoder.encoder.layer.14.attention.self.value.bias', 
'text_encoder.encoder.layer.14.attention.output.dense.weight', 'text_encoder.encoder.layer.14.attention.output.dense.bias', 'text_encoder.encoder.layer.14.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.14.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.14.intermediate.dense.weight', 'text_encoder.encoder.layer.14.intermediate.dense.bias', 'text_encoder.encoder.layer.14.output.dense.weight', 'text_encoder.encoder.layer.14.output.dense.bias', 'text_encoder.encoder.layer.14.output.LayerNorm.weight', 'text_encoder.encoder.layer.14.output.LayerNorm.bias', 'text_encoder.encoder.layer.15.attention.self.query.weight', 'text_encoder.encoder.layer.15.attention.self.query.bias', 'text_encoder.encoder.layer.15.attention.self.key.weight', 'text_encoder.encoder.layer.15.attention.self.key.bias', 'text_encoder.encoder.layer.15.attention.self.value.weight', 'text_encoder.encoder.layer.15.attention.self.value.bias', 'text_encoder.encoder.layer.15.attention.output.dense.weight', 'text_encoder.encoder.layer.15.attention.output.dense.bias', 'text_encoder.encoder.layer.15.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.15.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.15.intermediate.dense.weight', 'text_encoder.encoder.layer.15.intermediate.dense.bias', 'text_encoder.encoder.layer.15.output.dense.weight', 'text_encoder.encoder.layer.15.output.dense.bias', 'text_encoder.encoder.layer.15.output.LayerNorm.weight', 'text_encoder.encoder.layer.15.output.LayerNorm.bias', 'text_encoder.encoder.layer.16.attention.self.query.weight', 'text_encoder.encoder.layer.16.attention.self.query.bias', 'text_encoder.encoder.layer.16.attention.self.key.weight', 'text_encoder.encoder.layer.16.attention.self.key.bias', 'text_encoder.encoder.layer.16.attention.self.value.weight', 'text_encoder.encoder.layer.16.attention.self.value.bias', 'text_encoder.encoder.layer.16.attention.output.dense.weight', 
'text_encoder.encoder.layer.16.attention.output.dense.bias', 'text_encoder.encoder.layer.16.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.16.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.16.intermediate.dense.weight', 'text_encoder.encoder.layer.16.intermediate.dense.bias', 'text_encoder.encoder.layer.16.output.dense.weight', 'text_encoder.encoder.layer.16.output.dense.bias', 'text_encoder.encoder.layer.16.output.LayerNorm.weight', 'text_encoder.encoder.layer.16.output.LayerNorm.bias', 'text_encoder.encoder.layer.17.attention.self.query.weight', 'text_encoder.encoder.layer.17.attention.self.query.bias', 'text_encoder.encoder.layer.17.attention.self.key.weight', 'text_encoder.encoder.layer.17.attention.self.key.bias', 'text_encoder.encoder.layer.17.attention.self.value.weight', 'text_encoder.encoder.layer.17.attention.self.value.bias', 'text_encoder.encoder.layer.17.attention.output.dense.weight', 'text_encoder.encoder.layer.17.attention.output.dense.bias', 'text_encoder.encoder.layer.17.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.17.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.17.intermediate.dense.weight', 'text_encoder.encoder.layer.17.intermediate.dense.bias', 'text_encoder.encoder.layer.17.output.dense.weight', 'text_encoder.encoder.layer.17.output.dense.bias', 'text_encoder.encoder.layer.17.output.LayerNorm.weight', 'text_encoder.encoder.layer.17.output.LayerNorm.bias', 'text_encoder.encoder.layer.18.attention.self.query.weight', 'text_encoder.encoder.layer.18.attention.self.query.bias', 'text_encoder.encoder.layer.18.attention.self.key.weight', 'text_encoder.encoder.layer.18.attention.self.key.bias', 'text_encoder.encoder.layer.18.attention.self.value.weight', 'text_encoder.encoder.layer.18.attention.self.value.bias', 'text_encoder.encoder.layer.18.attention.output.dense.weight', 'text_encoder.encoder.layer.18.attention.output.dense.bias', 
'text_encoder.encoder.layer.18.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.18.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.18.intermediate.dense.weight', 'text_encoder.encoder.layer.18.intermediate.dense.bias', 'text_encoder.encoder.layer.18.output.dense.weight', 'text_encoder.encoder.layer.18.output.dense.bias', 'text_encoder.encoder.layer.18.output.LayerNorm.weight', 'text_encoder.encoder.layer.18.output.LayerNorm.bias', 'text_encoder.encoder.layer.19.attention.self.query.weight', 'text_encoder.encoder.layer.19.attention.self.query.bias', 'text_encoder.encoder.layer.19.attention.self.key.weight', 'text_encoder.encoder.layer.19.attention.self.key.bias', 'text_encoder.encoder.layer.19.attention.self.value.weight', 'text_encoder.encoder.layer.19.attention.self.value.bias', 'text_encoder.encoder.layer.19.attention.output.dense.weight', 'text_encoder.encoder.layer.19.attention.output.dense.bias', 'text_encoder.encoder.layer.19.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.19.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.19.crossattention.self.query.weight', 'text_encoder.encoder.layer.19.crossattention.self.query.bias', 'text_encoder.encoder.layer.19.crossattention.self.key.weight', 'text_encoder.encoder.layer.19.crossattention.self.key.bias', 'text_encoder.encoder.layer.19.crossattention.self.value.weight', 'text_encoder.encoder.layer.19.crossattention.self.value.bias', 'text_encoder.encoder.layer.19.crossattention.output.dense.weight', 'text_encoder.encoder.layer.19.crossattention.output.dense.bias', 'text_encoder.encoder.layer.19.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.19.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.19.intermediate.dense.weight', 'text_encoder.encoder.layer.19.intermediate.dense.bias', 'text_encoder.encoder.layer.19.output.dense.weight', 'text_encoder.encoder.layer.19.output.dense.bias', 
'text_encoder.encoder.layer.19.output.LayerNorm.weight', 'text_encoder.encoder.layer.19.output.LayerNorm.bias', 'text_encoder.encoder.layer.20.attention.self.query.weight', 'text_encoder.encoder.layer.20.attention.self.query.bias', 'text_encoder.encoder.layer.20.attention.self.key.weight', 'text_encoder.encoder.layer.20.attention.self.key.bias', 'text_encoder.encoder.layer.20.attention.self.value.weight', 'text_encoder.encoder.layer.20.attention.self.value.bias', 'text_encoder.encoder.layer.20.attention.output.dense.weight', 'text_encoder.encoder.layer.20.attention.output.dense.bias', 'text_encoder.encoder.layer.20.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.20.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.20.crossattention.self.query.weight', 'text_encoder.encoder.layer.20.crossattention.self.query.bias', 'text_encoder.encoder.layer.20.crossattention.self.key.weight', 'text_encoder.encoder.layer.20.crossattention.self.key.bias', 'text_encoder.encoder.layer.20.crossattention.self.value.weight', 'text_encoder.encoder.layer.20.crossattention.self.value.bias', 'text_encoder.encoder.layer.20.crossattention.output.dense.weight', 'text_encoder.encoder.layer.20.crossattention.output.dense.bias', 'text_encoder.encoder.layer.20.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.20.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.20.intermediate.dense.weight', 'text_encoder.encoder.layer.20.intermediate.dense.bias', 'text_encoder.encoder.layer.20.output.dense.weight', 'text_encoder.encoder.layer.20.output.dense.bias', 'text_encoder.encoder.layer.20.output.LayerNorm.weight', 'text_encoder.encoder.layer.20.output.LayerNorm.bias', 'text_encoder.encoder.layer.21.attention.self.query.weight', 'text_encoder.encoder.layer.21.attention.self.query.bias', 'text_encoder.encoder.layer.21.attention.self.key.weight', 'text_encoder.encoder.layer.21.attention.self.key.bias', 
'text_encoder.encoder.layer.21.attention.self.value.weight', 'text_encoder.encoder.layer.21.attention.self.value.bias', 'text_encoder.encoder.layer.21.attention.output.dense.weight', 'text_encoder.encoder.layer.21.attention.output.dense.bias', 'text_encoder.encoder.layer.21.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.21.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.21.crossattention.self.query.weight', 'text_encoder.encoder.layer.21.crossattention.self.query.bias', 'text_encoder.encoder.layer.21.crossattention.self.key.weight', 'text_encoder.encoder.layer.21.crossattention.self.key.bias', 'text_encoder.encoder.layer.21.crossattention.self.value.weight', 'text_encoder.encoder.layer.21.crossattention.self.value.bias', 'text_encoder.encoder.layer.21.crossattention.output.dense.weight', 'text_encoder.encoder.layer.21.crossattention.output.dense.bias', 'text_encoder.encoder.layer.21.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.21.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.21.intermediate.dense.weight', 'text_encoder.encoder.layer.21.intermediate.dense.bias', 'text_encoder.encoder.layer.21.output.dense.weight', 'text_encoder.encoder.layer.21.output.dense.bias', 'text_encoder.encoder.layer.21.output.LayerNorm.weight', 'text_encoder.encoder.layer.21.output.LayerNorm.bias', 'text_encoder.encoder.layer.22.attention.self.query.weight', 'text_encoder.encoder.layer.22.attention.self.query.bias', 'text_encoder.encoder.layer.22.attention.self.key.weight', 'text_encoder.encoder.layer.22.attention.self.key.bias', 'text_encoder.encoder.layer.22.attention.self.value.weight', 'text_encoder.encoder.layer.22.attention.self.value.bias', 'text_encoder.encoder.layer.22.attention.output.dense.weight', 'text_encoder.encoder.layer.22.attention.output.dense.bias', 'text_encoder.encoder.layer.22.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.22.attention.output.LayerNorm.bias', 
'text_encoder.encoder.layer.22.crossattention.self.query.weight', 'text_encoder.encoder.layer.22.crossattention.self.query.bias', 'text_encoder.encoder.layer.22.crossattention.self.key.weight', 'text_encoder.encoder.layer.22.crossattention.self.key.bias', 'text_encoder.encoder.layer.22.crossattention.self.value.weight', 'text_encoder.encoder.layer.22.crossattention.self.value.bias', 'text_encoder.encoder.layer.22.crossattention.output.dense.weight', 'text_encoder.encoder.layer.22.crossattention.output.dense.bias', 'text_encoder.encoder.layer.22.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.22.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.22.intermediate.dense.weight', 'text_encoder.encoder.layer.22.intermediate.dense.bias', 'text_encoder.encoder.layer.22.output.dense.weight', 'text_encoder.encoder.layer.22.output.dense.bias', 'text_encoder.encoder.layer.22.output.LayerNorm.weight', 'text_encoder.encoder.layer.22.output.LayerNorm.bias', 'text_encoder.encoder.layer.23.attention.self.query.weight', 'text_encoder.encoder.layer.23.attention.self.query.bias', 'text_encoder.encoder.layer.23.attention.self.key.weight', 'text_encoder.encoder.layer.23.attention.self.key.bias', 'text_encoder.encoder.layer.23.attention.self.value.weight', 'text_encoder.encoder.layer.23.attention.self.value.bias', 'text_encoder.encoder.layer.23.attention.output.dense.weight', 'text_encoder.encoder.layer.23.attention.output.dense.bias', 'text_encoder.encoder.layer.23.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.23.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.23.crossattention.self.query.weight', 'text_encoder.encoder.layer.23.crossattention.self.query.bias', 'text_encoder.encoder.layer.23.crossattention.self.key.weight', 'text_encoder.encoder.layer.23.crossattention.self.key.bias', 'text_encoder.encoder.layer.23.crossattention.self.value.weight', 'text_encoder.encoder.layer.23.crossattention.self.value.bias', 
'text_encoder.encoder.layer.23.crossattention.output.dense.weight', 'text_encoder.encoder.layer.23.crossattention.output.dense.bias', 'text_encoder.encoder.layer.23.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.23.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.23.intermediate.dense.weight', 'text_encoder.encoder.layer.23.intermediate.dense.bias', 'text_encoder.encoder.layer.23.output.dense.weight', 'text_encoder.encoder.layer.23.output.dense.bias', 'text_encoder.encoder.layer.23.output.LayerNorm.weight', 'text_encoder.encoder.layer.23.output.LayerNorm.bias'], unexpected_keys=['temp', 'itm_head.weight', 'itm_head.bias', 'text_encoder.bert.embeddings.position_ids', 'text_encoder.bert.embeddings.word_embeddings.weight', 'text_encoder.bert.embeddings.position_embeddings.weight', 'text_encoder.bert.embeddings.token_type_embeddings.weight', 'text_encoder.bert.embeddings.LayerNorm.weight', 'text_encoder.bert.embeddings.LayerNorm.bias', 'text_encoder.bert.encoder.layer.0.attention.self.query.weight', 'text_encoder.bert.encoder.layer.0.attention.self.query.bias', 'text_encoder.bert.encoder.layer.0.attention.self.key.weight', 'text_encoder.bert.encoder.layer.0.attention.self.key.bias', 'text_encoder.bert.encoder.layer.0.attention.self.value.weight', 'text_encoder.bert.encoder.layer.0.attention.self.value.bias', 'text_encoder.bert.encoder.layer.0.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.0.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.0.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.0.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.0.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.0.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.0.output.dense.weight', 'text_encoder.bert.encoder.layer.0.output.dense.bias', 'text_encoder.bert.encoder.layer.0.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.0.output.LayerNorm.bias', 
'text_encoder.bert.encoder.layer.1.attention.self.query.weight', 'text_encoder.bert.encoder.layer.1.attention.self.query.bias', 'text_encoder.bert.encoder.layer.1.attention.self.key.weight', 'text_encoder.bert.encoder.layer.1.attention.self.key.bias', 'text_encoder.bert.encoder.layer.1.attention.self.value.weight', 'text_encoder.bert.encoder.layer.1.attention.self.value.bias', 'text_encoder.bert.encoder.layer.1.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.1.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.1.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.1.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.1.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.1.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.1.output.dense.weight', 'text_encoder.bert.encoder.layer.1.output.dense.bias', 'text_encoder.bert.encoder.layer.1.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.1.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.2.attention.self.query.weight', 'text_encoder.bert.encoder.layer.2.attention.self.query.bias', 'text_encoder.bert.encoder.layer.2.attention.self.key.weight', 'text_encoder.bert.encoder.layer.2.attention.self.key.bias', 'text_encoder.bert.encoder.layer.2.attention.self.value.weight', 'text_encoder.bert.encoder.layer.2.attention.self.value.bias', 'text_encoder.bert.encoder.layer.2.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.2.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.2.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.2.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.2.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.2.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.2.output.dense.weight', 'text_encoder.bert.encoder.layer.2.output.dense.bias', 'text_encoder.bert.encoder.layer.2.output.LayerNorm.weight', 
'text_encoder.bert.encoder.layer.2.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.3.attention.self.query.weight', 'text_encoder.bert.encoder.layer.3.attention.self.query.bias', 'text_encoder.bert.encoder.layer.3.attention.self.key.weight', 'text_encoder.bert.encoder.layer.3.attention.self.key.bias', 'text_encoder.bert.encoder.layer.3.attention.self.value.weight', 'text_encoder.bert.encoder.layer.3.attention.self.value.bias', 'text_encoder.bert.encoder.layer.3.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.3.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.3.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.3.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.3.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.3.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.3.output.dense.weight', 'text_encoder.bert.encoder.layer.3.output.dense.bias', 'text_encoder.bert.encoder.layer.3.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.3.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.4.attention.self.query.weight', 'text_encoder.bert.encoder.layer.4.attention.self.query.bias', 'text_encoder.bert.encoder.layer.4.attention.self.key.weight', 'text_encoder.bert.encoder.layer.4.attention.self.key.bias', 'text_encoder.bert.encoder.layer.4.attention.self.value.weight', 'text_encoder.bert.encoder.layer.4.attention.self.value.bias', 'text_encoder.bert.encoder.layer.4.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.4.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.4.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.4.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.4.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.4.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.4.output.dense.weight', 'text_encoder.bert.encoder.layer.4.output.dense.bias', 
'text_encoder.bert.encoder.layer.4.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.4.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.5.attention.self.query.weight', 'text_encoder.bert.encoder.layer.5.attention.self.query.bias', 'text_encoder.bert.encoder.layer.5.attention.self.key.weight', 'text_encoder.bert.encoder.layer.5.attention.self.key.bias', 'text_encoder.bert.encoder.layer.5.attention.self.value.weight', 'text_encoder.bert.encoder.layer.5.attention.self.value.bias', 'text_encoder.bert.encoder.layer.5.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.5.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.5.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.5.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.5.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.5.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.5.output.dense.weight', 'text_encoder.bert.encoder.layer.5.output.dense.bias', 'text_encoder.bert.encoder.layer.5.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.5.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.6.attention.self.query.weight', 'text_encoder.bert.encoder.layer.6.attention.self.query.bias', 'text_encoder.bert.encoder.layer.6.attention.self.key.weight', 'text_encoder.bert.encoder.layer.6.attention.self.key.bias', 'text_encoder.bert.encoder.layer.6.attention.self.value.weight', 'text_encoder.bert.encoder.layer.6.attention.self.value.bias', 'text_encoder.bert.encoder.layer.6.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.6.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.6.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.6.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.6.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.6.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.6.output.dense.weight', 
'text_encoder.bert.encoder.layer.6.output.dense.bias', 'text_encoder.bert.encoder.layer.6.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.6.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.7.attention.self.query.weight', 'text_encoder.bert.encoder.layer.7.attention.self.query.bias', 'text_encoder.bert.encoder.layer.7.attention.self.key.weight', 'text_encoder.bert.encoder.layer.7.attention.self.key.bias', 'text_encoder.bert.encoder.layer.7.attention.self.value.weight', 'text_encoder.bert.encoder.layer.7.attention.self.value.bias', 'text_encoder.bert.encoder.layer.7.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.7.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.7.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.7.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.7.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.7.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.7.output.dense.weight', 'text_encoder.bert.encoder.layer.7.output.dense.bias', 'text_encoder.bert.encoder.layer.7.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.7.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.8.attention.self.query.weight', 'text_encoder.bert.encoder.layer.8.attention.self.query.bias', 'text_encoder.bert.encoder.layer.8.attention.self.key.weight', 'text_encoder.bert.encoder.layer.8.attention.self.key.bias', 'text_encoder.bert.encoder.layer.8.attention.self.value.weight', 'text_encoder.bert.encoder.layer.8.attention.self.value.bias', 'text_encoder.bert.encoder.layer.8.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.8.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.8.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.8.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.8.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.8.intermediate.dense.bias', 
'text_encoder.bert.encoder.layer.8.output.dense.weight', 'text_encoder.bert.encoder.layer.8.output.dense.bias', 'text_encoder.bert.encoder.layer.8.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.8.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.9.attention.self.query.weight', 'text_encoder.bert.encoder.layer.9.attention.self.query.bias', 'text_encoder.bert.encoder.layer.9.attention.self.key.weight', 'text_encoder.bert.encoder.layer.9.attention.self.key.bias', 'text_encoder.bert.encoder.layer.9.attention.self.value.weight', 'text_encoder.bert.encoder.layer.9.attention.self.value.bias', 'text_encoder.bert.encoder.layer.9.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.9.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.9.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.9.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.9.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.9.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.9.output.dense.weight', 'text_encoder.bert.encoder.layer.9.output.dense.bias', 'text_encoder.bert.encoder.layer.9.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.9.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.10.attention.self.query.weight', 'text_encoder.bert.encoder.layer.10.attention.self.query.bias', 'text_encoder.bert.encoder.layer.10.attention.self.key.weight', 'text_encoder.bert.encoder.layer.10.attention.self.key.bias', 'text_encoder.bert.encoder.layer.10.attention.self.value.weight', 'text_encoder.bert.encoder.layer.10.attention.self.value.bias', 'text_encoder.bert.encoder.layer.10.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.10.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.10.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.10.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.10.intermediate.dense.weight', 
'text_encoder.bert.encoder.layer.10.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.10.output.dense.weight', 'text_encoder.bert.encoder.layer.10.output.dense.bias', 'text_encoder.bert.encoder.layer.10.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.10.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.11.attention.self.query.weight', 'text_encoder.bert.encoder.layer.11.attention.self.query.bias', 'text_encoder.bert.encoder.layer.11.attention.self.key.weight', 'text_encoder.bert.encoder.layer.11.attention.self.key.bias', 'text_encoder.bert.encoder.layer.11.attention.self.value.weight', 'text_encoder.bert.encoder.layer.11.attention.self.value.bias', 'text_encoder.bert.encoder.layer.11.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.11.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.11.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.11.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.11.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.11.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.11.output.dense.weight', 'text_encoder.bert.encoder.layer.11.output.dense.bias', 'text_encoder.bert.encoder.layer.11.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.11.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.12.attention.self.query.weight', 'text_encoder.bert.encoder.layer.12.attention.self.query.bias', 'text_encoder.bert.encoder.layer.12.attention.self.key.weight', 'text_encoder.bert.encoder.layer.12.attention.self.key.bias', 'text_encoder.bert.encoder.layer.12.attention.self.value.weight', 'text_encoder.bert.encoder.layer.12.attention.self.value.bias', 'text_encoder.bert.encoder.layer.12.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.12.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.12.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.12.attention.output.LayerNorm.bias', 
'text_encoder.bert.encoder.layer.12.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.12.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.12.output.dense.weight', 'text_encoder.bert.encoder.layer.12.output.dense.bias', 'text_encoder.bert.encoder.layer.12.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.12.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.13.attention.self.query.weight', 'text_encoder.bert.encoder.layer.13.attention.self.query.bias', 'text_encoder.bert.encoder.layer.13.attention.self.key.weight', 'text_encoder.bert.encoder.layer.13.attention.self.key.bias', 'text_encoder.bert.encoder.layer.13.attention.self.value.weight', 'text_encoder.bert.encoder.layer.13.attention.self.value.bias', 'text_encoder.bert.encoder.layer.13.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.13.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.13.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.13.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.13.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.13.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.13.output.dense.weight', 'text_encoder.bert.encoder.layer.13.output.dense.bias', 'text_encoder.bert.encoder.layer.13.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.13.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.14.attention.self.query.weight', 'text_encoder.bert.encoder.layer.14.attention.self.query.bias', 'text_encoder.bert.encoder.layer.14.attention.self.key.weight', 'text_encoder.bert.encoder.layer.14.attention.self.key.bias', 'text_encoder.bert.encoder.layer.14.attention.self.value.weight', 'text_encoder.bert.encoder.layer.14.attention.self.value.bias', 'text_encoder.bert.encoder.layer.14.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.14.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.14.attention.output.LayerNorm.weight', 
'text_encoder.bert.encoder.layer.14.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.14.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.14.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.14.output.dense.weight', 'text_encoder.bert.encoder.layer.14.output.dense.bias', 'text_encoder.bert.encoder.layer.14.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.14.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.15.attention.self.query.weight', 'text_encoder.bert.encoder.layer.15.attention.self.query.bias', 'text_encoder.bert.encoder.layer.15.attention.self.key.weight', 'text_encoder.bert.encoder.layer.15.attention.self.key.bias', 'text_encoder.bert.encoder.layer.15.attention.self.value.weight', 'text_encoder.bert.encoder.layer.15.attention.self.value.bias', 'text_encoder.bert.encoder.layer.15.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.15.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.15.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.15.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.15.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.15.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.15.output.dense.weight', 'text_encoder.bert.encoder.layer.15.output.dense.bias', 'text_encoder.bert.encoder.layer.15.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.15.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.16.attention.self.query.weight', 'text_encoder.bert.encoder.layer.16.attention.self.query.bias', 'text_encoder.bert.encoder.layer.16.attention.self.key.weight', 'text_encoder.bert.encoder.layer.16.attention.self.key.bias', 'text_encoder.bert.encoder.layer.16.attention.self.value.weight', 'text_encoder.bert.encoder.layer.16.attention.self.value.bias', 'text_encoder.bert.encoder.layer.16.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.16.attention.output.dense.bias', 
'text_encoder.bert.encoder.layer.16.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.16.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.16.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.16.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.16.output.dense.weight', 'text_encoder.bert.encoder.layer.16.output.dense.bias', 'text_encoder.bert.encoder.layer.16.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.16.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.17.attention.self.query.weight', 'text_encoder.bert.encoder.layer.17.attention.self.query.bias', 'text_encoder.bert.encoder.layer.17.attention.self.key.weight', 'text_encoder.bert.encoder.layer.17.attention.self.key.bias', 'text_encoder.bert.encoder.layer.17.attention.self.value.weight', 'text_encoder.bert.encoder.layer.17.attention.self.value.bias', 'text_encoder.bert.encoder.layer.17.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.17.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.17.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.17.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.17.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.17.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.17.output.dense.weight', 'text_encoder.bert.encoder.layer.17.output.dense.bias', 'text_encoder.bert.encoder.layer.17.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.17.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.18.attention.self.query.weight', 'text_encoder.bert.encoder.layer.18.attention.self.query.bias', 'text_encoder.bert.encoder.layer.18.attention.self.key.weight', 'text_encoder.bert.encoder.layer.18.attention.self.key.bias', 'text_encoder.bert.encoder.layer.18.attention.self.value.weight', 'text_encoder.bert.encoder.layer.18.attention.self.value.bias', 'text_encoder.bert.encoder.layer.18.attention.output.dense.weight', 
'text_encoder.bert.encoder.layer.18.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.18.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.18.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.18.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.18.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.18.output.dense.weight', 'text_encoder.bert.encoder.layer.18.output.dense.bias', 'text_encoder.bert.encoder.layer.18.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.18.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.19.attention.self.query.weight', 'text_encoder.bert.encoder.layer.19.attention.self.query.bias', 'text_encoder.bert.encoder.layer.19.attention.self.key.weight', 'text_encoder.bert.encoder.layer.19.attention.self.key.bias', 'text_encoder.bert.encoder.layer.19.attention.self.value.weight', 'text_encoder.bert.encoder.layer.19.attention.self.value.bias', 'text_encoder.bert.encoder.layer.19.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.19.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.19.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.19.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.19.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.19.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.19.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.19.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.19.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.19.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.19.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.19.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.19.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.19.crossattention.output.LayerNorm.bias', 
'text_encoder.bert.encoder.layer.19.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.19.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.19.output.dense.weight', 'text_encoder.bert.encoder.layer.19.output.dense.bias', 'text_encoder.bert.encoder.layer.19.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.19.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.20.attention.self.query.weight', 'text_encoder.bert.encoder.layer.20.attention.self.query.bias', 'text_encoder.bert.encoder.layer.20.attention.self.key.weight', 'text_encoder.bert.encoder.layer.20.attention.self.key.bias', 'text_encoder.bert.encoder.layer.20.attention.self.value.weight', 'text_encoder.bert.encoder.layer.20.attention.self.value.bias', 'text_encoder.bert.encoder.layer.20.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.20.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.20.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.20.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.20.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.20.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.20.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.20.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.20.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.20.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.20.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.20.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.20.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.20.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.20.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.20.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.20.output.dense.weight', 'text_encoder.bert.encoder.layer.20.output.dense.bias', 
'text_encoder.bert.encoder.layer.20.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.20.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.21.attention.self.query.weight', 'text_encoder.bert.encoder.layer.21.attention.self.query.bias', 'text_encoder.bert.encoder.layer.21.attention.self.key.weight', 'text_encoder.bert.encoder.layer.21.attention.self.key.bias', 'text_encoder.bert.encoder.layer.21.attention.self.value.weight', 'text_encoder.bert.encoder.layer.21.attention.self.value.bias', 'text_encoder.bert.encoder.layer.21.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.21.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.21.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.21.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.21.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.21.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.21.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.21.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.21.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.21.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.21.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.21.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.21.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.21.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.21.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.21.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.21.output.dense.weight', 'text_encoder.bert.encoder.layer.21.output.dense.bias', 'text_encoder.bert.encoder.layer.21.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.21.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.22.attention.self.query.weight', 'text_encoder.bert.encoder.layer.22.attention.self.query.bias', 
'text_encoder.bert.encoder.layer.22.attention.self.key.weight', 'text_encoder.bert.encoder.layer.22.attention.self.key.bias', 'text_encoder.bert.encoder.layer.22.attention.self.value.weight', 'text_encoder.bert.encoder.layer.22.attention.self.value.bias', 'text_encoder.bert.encoder.layer.22.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.22.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.22.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.22.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.22.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.22.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.22.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.22.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.22.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.22.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.22.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.22.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.22.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.22.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.22.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.22.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.22.output.dense.weight', 'text_encoder.bert.encoder.layer.22.output.dense.bias', 'text_encoder.bert.encoder.layer.22.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.22.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.23.attention.self.query.weight', 'text_encoder.bert.encoder.layer.23.attention.self.query.bias', 'text_encoder.bert.encoder.layer.23.attention.self.key.weight', 'text_encoder.bert.encoder.layer.23.attention.self.key.bias', 'text_encoder.bert.encoder.layer.23.attention.self.value.weight', 'text_encoder.bert.encoder.layer.23.attention.self.value.bias', 
'text_encoder.bert.encoder.layer.23.attention.output.dense.weight', 'text_encoder.bert.encoder.layer.23.attention.output.dense.bias', 'text_encoder.bert.encoder.layer.23.attention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.23.attention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.23.crossattention.self.query.weight', 'text_encoder.bert.encoder.layer.23.crossattention.self.query.bias', 'text_encoder.bert.encoder.layer.23.crossattention.self.key.weight', 'text_encoder.bert.encoder.layer.23.crossattention.self.key.bias', 'text_encoder.bert.encoder.layer.23.crossattention.self.value.weight', 'text_encoder.bert.encoder.layer.23.crossattention.self.value.bias', 'text_encoder.bert.encoder.layer.23.crossattention.output.dense.weight', 'text_encoder.bert.encoder.layer.23.crossattention.output.dense.bias', 'text_encoder.bert.encoder.layer.23.crossattention.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.23.crossattention.output.LayerNorm.bias', 'text_encoder.bert.encoder.layer.23.intermediate.dense.weight', 'text_encoder.bert.encoder.layer.23.intermediate.dense.bias', 'text_encoder.bert.encoder.layer.23.output.dense.weight', 'text_encoder.bert.encoder.layer.23.output.dense.bias', 'text_encoder.bert.encoder.layer.23.output.LayerNorm.weight', 'text_encoder.bert.encoder.layer.23.output.LayerNorm.bias', 'text_encoder.cls.predictions.bias', 'text_encoder.cls.predictions.transform.dense.weight', 'text_encoder.cls.predictions.transform.dense.bias', 'text_encoder.cls.predictions.transform.LayerNorm.weight', 'text_encoder.cls.predictions.transform.LayerNorm.bias', 'text_encoder.cls.predictions.decoder.weight', 'text_encoder.cls.predictions.decoder.bias'])\n"
     ]
    }
   ],
   "source": [
    "if 'intern_model' in locals():\n",
    "    del intern_model\n",
    "    del tokenizer\n",
    "config = Config.from_file('/home/toolkit/eai_urlb/InternVideo/InternVideo2/multi_modality/demo/internvideo2_stage2_config.py')\n",
    "config = eval_dict_leaf(config)\n",
    "config.model.vision_encoder.num_frames = 8\n",
    "config.num_frames = 8\n",
    "config.num_frames_test = 8\n",
    "config.model.text_encoder.pretrained = '/home/toolkit/.cache/huggingface/hub/models--bert-large-uncased/snapshots/6da4b6a26a1877e173fca3225479512db81a5e5b/'\n",
    "config.model.text_encoder.config = '/home/toolkit/eai_urlb/InternVideo/InternVideo2/multi_modality/' + config.model.text_encoder.config\n",
    "model_pth = '/home/toolkit/eai_urlb/InternVideo/InternVideo2/download_models/InternVideo2-stage2_1b-224p-f4.pt'\n",
    "config.pretrained_path = model_pth\n",
    "config['model']['vision_encoder']['pretrained'] = model_pth\n",
    "intern_model, tokenizer = setup_internvideo2(config)    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Video tensor([0.0030], device='cuda:0')\n",
      "Text tensor([-0.0008, -0.0001, -0.0013, -0.0014,  0.0005, -0.0004, -0.0004, -0.0006,\n",
      "         0.0001, -0.0003,  0.0003,  0.0012, -0.0004,  0.0007, -0.0014, -0.0017,\n",
      "        -0.0007, -0.0018, -0.0006, -0.0024], device='cuda:0')\n",
      "text: Somebody walking ~ prob: 0.6945\n",
      "text: Playing with hat ~ prob: 0.1198\n",
      "text: A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride. ~ prob: 0.0297\n",
      "text: A person with a hat ~ prob: 0.0245\n",
      "text: A person dressed in a blue jacket shovels the snow-covered pavement outside their house. ~ prob: 0.0226\n",
      "text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.0222\n",
      "text: A group of friends playing bowling. ~ prob: 0.0212\n",
      "text: A person lying in bed ~ prob: 0.0208\n",
      "text: A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery. ~ prob: 0.0186\n",
      "text: A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees. ~ prob: 0.0102\n",
      "text: A person playing with a kid in the street ~ prob: 0.0045\n",
      "text: A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys. ~ prob: 0.0025\n",
      "text: A playful dog slides down a snowy hill, wagging its tail with delight. ~ prob: 0.0024\n",
      "text: A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees. ~ prob: 0.0015\n"
     ]
    },
    {
     "ename": "",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
     ]
    }
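   ],
   "source": [
    "intern_model.eval()\n",
    "texts, probs = retrieve_text(frames, text_candidates, model=intern_model, topk=14, config=config)\n",
    "\n",
    "# Output pasted from a different run, kept for comparison (its values differ from this cell's recorded output):\n",
    "# Video tensor([0.0023], device='cuda:0')\n",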
    "# Text tensor([-0.0008, -0.0001, -0.0013, -0.0014,  0.0005, -0.0004, -0.0004, -0.0006,\n",
    "#          0.0001, -0.0003,  0.0003,  0.0012, -0.0004,  0.0007, -0.0014, -0.0017,\n",
    "#         -0.0007, -0.0018, -0.0006], device='cuda:0')\n",
    "# text: A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery. ~ prob: 0.4592\n",
    "# text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.1335\n",
    "# text: A japanese girl eating noodles ~ prob: 0.1089\n",
    "\n",
    "for t, p in zip(texts, probs):\n",
    "    print(f'text: {t} ~ prob: {p:.4f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Parameter containing:\n",
       "tensor([[[[[-4.7913e-03, -2.1515e-03, -2.0447e-03,  ...,  3.2997e-04,\n",
       "            -3.0212e-03, -7.9727e-04],\n",
       "           [ 9.7656e-04,  2.4567e-03,  9.8419e-04,  ..., -1.8845e-03,\n",
       "             2.3193e-03,  3.6621e-03],\n",
       "           [-3.5095e-04,  2.5940e-03, -2.7618e-03,  ..., -3.7956e-04,\n",
       "            -3.1948e-05,  1.7166e-03],\n",
       "           ...,\n",
       "           [-3.8605e-03, -1.1215e-03, -9.0790e-04,  ...,  6.5994e-04,\n",
       "             1.0071e-03,  1.2894e-03],\n",
       "           [-2.2278e-03,  1.2589e-03, -1.0204e-04,  ...,  3.7079e-03,\n",
       "             1.5354e-04, -8.3160e-04],\n",
       "           [ 6.3324e-04,  1.4114e-03,  9.5367e-04,  ..., -3.4485e-03,\n",
       "            -1.8234e-03, -4.0283e-03]]],\n",
       "\n",
       "\n",
       "         [[[ 7.0190e-04, -1.3657e-03, -6.5994e-04,  ...,  1.4725e-03,\n",
       "            -8.5831e-04,  1.6212e-04],\n",
       "           [ 1.7262e-04,  8.0872e-04,  4.1485e-05,  ..., -6.4850e-04,\n",
       "             5.5695e-04,  1.7242e-03],\n",
       "           [ 1.3504e-03,  3.2959e-03, -1.3275e-03,  ...,  2.2736e-03,\n",
       "             4.2725e-04,  1.9150e-03],\n",
       "           ...,\n",
       "           [-2.3041e-03, -6.4850e-04, -2.8839e-03,  ...,  2.9755e-04,\n",
       "            -3.0518e-04,  1.2817e-03],\n",
       "           [ 9.3079e-04, -1.2512e-03, -1.5335e-03,  ...,  1.9455e-03,\n",
       "            -3.4142e-04, -1.2054e-03],\n",
       "           [ 9.1553e-03,  3.6774e-03,  2.2125e-03,  ..., -5.3883e-05,\n",
       "             3.2234e-04,  2.3499e-03]]],\n",
       "\n",
       "\n",
       "         [[[ 2.0752e-03,  7.4768e-04,  2.6512e-04,  ...,  2.3193e-03,\n",
       "            -3.3379e-04, -9.2983e-05],\n",
       "           [ 1.4725e-03,  1.0986e-03, -8.8692e-05,  ..., -2.8229e-04,\n",
       "             7.2098e-04, -2.2888e-03],\n",
       "           [ 1.3809e-03,  1.5945e-03,  6.5231e-04,  ...,  3.3112e-03,\n",
       "             2.1515e-03, -1.4114e-03],\n",
       "           ...,\n",
       "           [-1.2512e-03,  1.0605e-03,  5.6744e-05,  ..., -4.7112e-04,\n",
       "            -3.4714e-04, -1.6861e-03],\n",
       "           [-5.4550e-04,  1.1978e-03,  1.9531e-03,  ...,  7.6675e-04,\n",
       "            -1.9150e-03, -1.6937e-03],\n",
       "           [-4.5776e-03, -3.0212e-03, -1.4648e-03,  ..., -1.0757e-03,\n",
       "             1.0061e-04,  2.9449e-03]]]],\n",
       "\n",
       "\n",
       "\n",
       "        [[[[-9.9487e-03, -5.9814e-03,  3.9673e-03,  ...,  7.8125e-03,\n",
       "             4.5776e-03, -4.7607e-03],\n",
       "           [-7.5989e-03,  2.5940e-04, -6.0730e-03,  ..., -1.4725e-03,\n",
       "            -3.8300e-03, -2.4567e-03],\n",
       "           [ 6.9427e-04,  3.1090e-04, -2.1515e-03,  ..., -1.2779e-04,\n",
       "            -6.0120e-03, -1.4191e-03],\n",
       "           ...,\n",
       "           [ 1.1597e-02, -8.3447e-05, -1.3428e-03,  ..., -4.4556e-03,\n",
       "            -4.4823e-04, -1.6861e-03],\n",
       "           [ 4.3640e-03, -2.0447e-03, -1.3123e-03,  ..., -4.4556e-03,\n",
       "            -4.0283e-03, -4.6387e-03],\n",
       "           [ 9.2163e-03, -5.1880e-03,  1.3351e-03,  ...,  4.7112e-04,\n",
       "             2.6550e-03,  4.9744e-03]]],\n",
       "\n",
       "\n",
       "         [[[ 4.3030e-03, -6.3171e-03, -1.2436e-03,  ...,  2.1210e-03,\n",
       "            -9.4250e-07, -1.0559e-02],\n",
       "           [-1.2436e-03, -4.1504e-03, -9.7046e-03,  ..., -2.8687e-03,\n",
       "            -6.9885e-03, -9.7046e-03],\n",
       "           [ 6.2561e-04, -5.7678e-03, -3.9978e-03,  ..., -1.9989e-03,\n",
       "            -4.5166e-03, -5.5542e-03],\n",
       "           ...,\n",
       "           [ 7.6904e-03, -2.9144e-03, -2.0905e-03,  ..., -3.9368e-03,\n",
       "             2.1515e-03, -3.9062e-03],\n",
       "           [-6.9427e-04, -2.9907e-03, -1.2512e-03,  ...,  1.6785e-03,\n",
       "             5.8594e-03, -2.0294e-03],\n",
       "           [ 7.8678e-05, -6.0730e-03,  1.0834e-03,  ...,  2.9564e-04,\n",
       "             3.1738e-03, -8.4839e-03]]],\n",
       "\n",
       "\n",
       "         [[[ 4.2419e-03, -7.5073e-03, -2.8381e-03,  ..., -7.7515e-03,\n",
       "            -6.6223e-03,  2.1667e-03],\n",
       "           [ 8.1787e-03,  5.8899e-03,  1.0376e-03,  ..., -1.8463e-03,\n",
       "            -3.1281e-03,  5.8899e-03],\n",
       "           [ 1.7776e-03, -4.2915e-04,  8.6975e-04,  ..., -4.2915e-05,\n",
       "            -3.2043e-03,  9.5825e-03],\n",
       "           ...,\n",
       "           [ 2.7847e-04, -1.9989e-03, -5.2490e-03,  ..., -5.7068e-03,\n",
       "            -4.5776e-04, -3.5095e-03],\n",
       "           [-5.3406e-03, -6.9427e-04, -4.9133e-03,  ..., -1.0910e-03,\n",
       "            -6.4468e-04, -5.1880e-03],\n",
       "           [-7.3853e-03, -2.1210e-03,  4.7302e-03,  ...,  2.0752e-03,\n",
       "            -2.0447e-03, -1.2329e-02]]]],\n",
       "\n",
       "\n",
       "\n",
       "        [[[[ 4.4861e-03, -2.8992e-03, -4.7302e-03,  ..., -5.3406e-03,\n",
       "            -4.6692e-03, -4.6387e-03],\n",
       "           [ 1.8921e-03, -5.6458e-03, -3.7079e-03,  ..., -2.5482e-03,\n",
       "            -4.8218e-03,  2.1515e-03],\n",
       "           [ 4.5471e-03,  2.9755e-04, -3.7842e-03,  ...,  3.6774e-03,\n",
       "            -2.6550e-03, -1.8845e-03],\n",
       "           ...,\n",
       "           [ 7.2098e-04,  3.1281e-03,  2.0027e-04,  ...,  2.7924e-03,\n",
       "             1.0986e-03,  3.4943e-03],\n",
       "           [ 1.4496e-03, -2.8229e-04,  7.0801e-03,  ...,  1.0071e-03,\n",
       "            -3.9978e-03,  3.7689e-03],\n",
       "           [ 9.9945e-04,  7.3624e-04,  9.7046e-03,  ...,  3.9673e-03,\n",
       "             6.7139e-03,  1.1414e-02]]],\n",
       "\n",
       "\n",
       "         [[[-3.5400e-03, -6.9809e-04,  3.9673e-03,  ...,  7.1716e-04,\n",
       "             2.3651e-03,  1.6098e-03],\n",
       "           [-1.7319e-03,  8.0109e-04,  2.7466e-03,  ..., -1.7262e-04,\n",
       "            -1.6937e-03,  6.1340e-03],\n",
       "           [-3.9978e-03,  2.0599e-03, -2.4414e-03,  ...,  2.2888e-03,\n",
       "             2.2736e-03,  4.1809e-03],\n",
       "           ...,\n",
       "           [-6.6223e-03, -1.0529e-03, -3.0823e-03,  ...,  1.2894e-03,\n",
       "             1.7624e-03, -6.0425e-03],\n",
       "           [ 7.5531e-04, -2.0599e-03,  2.0142e-03,  ...,  3.3569e-03,\n",
       "             1.8215e-04, -7.1411e-03],\n",
       "           [ 5.5237e-03,  2.3842e-04,  7.2937e-03,  ..., -4.1809e-03,\n",
       "            -4.4861e-03, -1.7700e-02]]],\n",
       "\n",
       "\n",
       "         [[[-7.1716e-04, -1.5488e-03, -2.5635e-03,  ...,  1.8692e-03,\n",
       "             5.4016e-03,  3.8300e-03],\n",
       "           [ 1.9531e-03, -9.1934e-04,  2.0981e-05,  ..., -5.3024e-04,\n",
       "            -1.9989e-03,  1.1778e-04],\n",
       "           [ 2.5635e-03,  4.4556e-03, -3.9978e-03,  ...,  1.7548e-03,\n",
       "             1.4114e-04, -1.2817e-03],\n",
       "           ...,\n",
       "           [-1.9226e-03, -4.1389e-04, -4.2114e-03,  ...,  8.2016e-04,\n",
       "             5.0964e-03,  2.5330e-03],\n",
       "           [ 8.4229e-03,  1.8539e-03,  1.4038e-03,  ...,  2.4109e-03,\n",
       "             1.8616e-03, -1.0300e-03],\n",
       "           [-8.6784e-05, -7.3547e-03, -1.5182e-03,  ...,  1.5335e-03,\n",
       "             2.2736e-03, -8.4839e-03]]]],\n",
       "\n",
       "\n",
       "\n",
       "        ...,\n",
       "\n",
       "\n",
       "\n",
       "        [[[[-1.2207e-02,  4.0771e-02, -2.9419e-02,  ...,  6.5918e-02,\n",
       "            -2.6978e-02,  3.0640e-02],\n",
       "           [ 7.8125e-02,  4.8828e-02, -5.3955e-02,  ...,  4.3945e-02,\n",
       "            -3.3447e-02,  3.4424e-02],\n",
       "           [-1.3489e-02,  7.2021e-03, -5.0293e-02,  ..., -1.9043e-02,\n",
       "            -4.4189e-02, -3.7354e-02],\n",
       "           ...,\n",
       "           [-3.1250e-02, -1.1047e-02,  5.7617e-02,  ..., -1.9287e-02,\n",
       "             6.2500e-02, -6.5308e-03],\n",
       "           [-3.1738e-02,  5.6152e-03, -1.0986e-02,  ..., -5.6763e-03,\n",
       "             2.3804e-02,  6.2500e-02],\n",
       "           [-3.5400e-02, -4.4861e-03,  3.7109e-02,  ...,  3.3691e-02,\n",
       "            -7.1777e-02,  9.3750e-02]]],\n",
       "\n",
       "\n",
       "         [[[-1.5332e-01,  3.3203e-02, -7.8125e-02,  ...,  6.9824e-02,\n",
       "            -1.1902e-02,  6.1340e-03],\n",
       "           [ 5.1117e-04,  2.8809e-02, -7.9102e-02,  ...,  5.4932e-02,\n",
       "            -8.6670e-03,  2.2827e-02],\n",
       "           [-7.3853e-03,  6.6528e-03, -4.9561e-02,  ...,  5.6076e-04,\n",
       "            -3.0029e-02, -1.6724e-02],\n",
       "           ...,\n",
       "           [ 6.7871e-02,  5.0049e-02,  7.0801e-02,  ..., -6.1646e-03,\n",
       "             7.9102e-02, -4.5410e-02],\n",
       "           [ 3.5706e-03,  6.5308e-03, -9.0942e-03,  ..., -2.1210e-03,\n",
       "             4.9561e-02,  4.6143e-02],\n",
       "           [ 5.6152e-02,  1.8311e-02,  4.5898e-02,  ...,  2.8564e-02,\n",
       "            -9.3262e-02, -4.9316e-02]]],\n",
       "\n",
       "\n",
       "         [[[-5.1270e-02,  9.4727e-02,  2.1973e-02,  ...,  4.9072e-02,\n",
       "            -6.0547e-02,  6.9580e-03],\n",
       "           [-1.7578e-02,  1.5869e-02, -5.0293e-02,  ...,  9.4604e-03,\n",
       "            -6.9336e-02,  7.2098e-04],\n",
       "           [-6.4453e-02, -1.0620e-02, -7.0801e-02,  ...,  3.5156e-02,\n",
       "            -4.1016e-02, -3.4912e-02],\n",
       "           ...,\n",
       "           [-5.2185e-03,  3.0640e-02,  7.5195e-02,  ...,  4.1260e-02,\n",
       "             8.6426e-02, -2.6367e-02],\n",
       "           [ 2.8076e-03,  5.4626e-03,  2.0874e-02,  ...,  1.0452e-03,\n",
       "            -1.2207e-02,  2.1973e-03],\n",
       "           [-7.7148e-02, -5.0781e-02,  3.3936e-02,  ...,  1.7334e-02,\n",
       "            -1.2988e-01, -5.0781e-02]]]],\n",
       "\n",
       "\n",
       "\n",
       "        [[[[-5.6885e-02, -1.1035e-01, -2.1118e-02,  ...,  5.2002e-02,\n",
       "            -7.9346e-03, -5.3711e-02],\n",
       "           [ 2.9053e-02,  1.7944e-02, -1.0315e-02,  ...,  3.6621e-02,\n",
       "             3.3936e-02, -1.4587e-02],\n",
       "           [ 1.5259e-04, -2.6245e-02, -9.5703e-02,  ...,  9.5825e-03,\n",
       "             6.3965e-02,  2.9907e-02],\n",
       "           ...,\n",
       "           [ 3.8818e-02,  2.9907e-02,  2.7710e-02,  ..., -1.0938e-01,\n",
       "            -4.6387e-02,  5.1575e-03],\n",
       "           [-1.7212e-02,  1.6235e-02, -6.0547e-02,  ..., -2.7710e-02,\n",
       "            -5.9204e-03,  2.5024e-02],\n",
       "           [-4.1504e-02,  1.3794e-02, -8.2520e-02,  ...,  4.7852e-02,\n",
       "             8.2520e-02, -9.4238e-02]]],\n",
       "\n",
       "\n",
       "         [[[ 3.7842e-02, -9.7656e-02, -4.6143e-02,  ...,  2.5635e-02,\n",
       "            -1.6479e-02,  1.9531e-03],\n",
       "           [ 7.7148e-02,  4.0771e-02, -3.4027e-03,  ...,  1.4648e-02,\n",
       "            -2.0142e-02, -8.6670e-03],\n",
       "           [ 6.1035e-02,  6.5308e-03, -2.5635e-02,  ...,  5.0049e-02,\n",
       "             4.8828e-03, -2.4780e-02],\n",
       "           ...,\n",
       "           [ 9.9487e-03, -6.6528e-03, -1.1353e-02,  ..., -1.1572e-01,\n",
       "             1.9043e-02,  5.9082e-02],\n",
       "           [ 3.8086e-02,  4.2725e-02, -4.7363e-02,  ...,  7.5989e-03,\n",
       "             6.4087e-03,  7.3853e-03],\n",
       "           [-7.2632e-03,  9.0820e-02, -5.2979e-02,  ...,  4.4678e-02,\n",
       "             6.6895e-02, -1.0303e-01]]],\n",
       "\n",
       "\n",
       "         [[[ 1.4526e-02, -1.1475e-01, -3.8818e-02,  ...,  8.1055e-02,\n",
       "             3.8818e-02,  2.1118e-02],\n",
       "           [ 1.2354e-01,  5.1025e-02,  2.1973e-03,  ...,  1.8677e-02,\n",
       "             1.5991e-02, -3.3203e-02],\n",
       "           [ 7.7148e-02,  6.9336e-02, -3.7842e-02,  ..., -3.0151e-02,\n",
       "             3.9062e-02, -2.2217e-02],\n",
       "           ...,\n",
       "           [ 4.3701e-02,  8.2520e-02,  1.0156e-01,  ..., -8.5449e-02,\n",
       "            -3.0060e-03,  1.0547e-01],\n",
       "           [ 3.0640e-02,  8.0078e-02,  2.1606e-02,  ..., -2.0264e-02,\n",
       "             1.9287e-02,  7.8613e-02],\n",
       "           [ 4.4678e-02,  9.7168e-02, -4.9561e-02,  ...,  3.6377e-02,\n",
       "             1.3477e-01, -2.7771e-03]]]],\n",
       "\n",
       "\n",
       "\n",
       "        [[[[-2.0996e-01, -4.6387e-02, -5.6458e-03,  ...,  1.7334e-02,\n",
       "             4.6082e-03, -1.4038e-02],\n",
       "           [-2.8931e-02,  2.0020e-02, -8.7891e-03,  ..., -8.2520e-02,\n",
       "            -5.2002e-02, -1.5869e-03],\n",
       "           [ 7.1289e-02,  4.3335e-03,  1.1047e-02,  ..., -7.5684e-03,\n",
       "            -1.7456e-02,  1.5137e-02],\n",
       "           ...,\n",
       "           [ 5.0781e-02,  4.3213e-02, -8.1055e-02,  ..., -3.9062e-02,\n",
       "            -1.0693e-01, -2.9175e-02],\n",
       "           [ 6.2500e-02, -3.9062e-03, -7.8735e-03,  ..., -3.6377e-02,\n",
       "            -4.8340e-02,  4.8340e-02],\n",
       "           [-8.4473e-02, -3.3447e-02, -6.7383e-02,  ...,  4.0527e-02,\n",
       "            -6.9885e-03,  1.0547e-01]]],\n",
       "\n",
       "\n",
       "         [[[-2.2266e-01, -4.6143e-02, -2.9541e-02,  ...,  6.8054e-03,\n",
       "             3.4180e-02, -2.3682e-02],\n",
       "           [-2.4170e-02,  5.7129e-02,  4.0771e-02,  ..., -4.5898e-02,\n",
       "            -1.1536e-02,  9.0942e-03],\n",
       "           [ 7.8613e-02,  4.6631e-02,  9.9182e-04,  ...,  4.2236e-02,\n",
       "             2.5879e-02,  4.2236e-02],\n",
       "           ...,\n",
       "           [ 5.7129e-02,  3.1250e-02, -7.7148e-02,  ..., -2.1362e-02,\n",
       "            -2.8809e-02, -6.3171e-03],\n",
       "           [ 1.0352e-01,  6.2988e-02, -1.6602e-02,  ..., -3.4668e-02,\n",
       "            -5.9128e-04,  6.2988e-02],\n",
       "           [-1.0193e-02,  5.0537e-02, -1.6479e-02,  ...,  3.1738e-02,\n",
       "            -4.3945e-02, -2.5146e-02]]],\n",
       "\n",
       "\n",
       "         [[[ 5.5176e-02, -2.3804e-02, -2.0020e-02,  ..., -5.1270e-03,\n",
       "            -5.8899e-03, -2.4414e-02],\n",
       "           [ 5.4199e-02,  4.0894e-03,  8.1787e-03,  ..., -1.5320e-02,\n",
       "             1.3885e-03,  3.7842e-02],\n",
       "           [ 1.0938e-01,  5.5847e-03, -1.3184e-02,  ...,  4.1260e-02,\n",
       "             2.1484e-02,  5.2734e-02],\n",
       "           ...,\n",
       "           [-4.3297e-04, -1.1169e-02, -8.5449e-02,  ...,  2.0264e-02,\n",
       "            -4.7363e-02, -3.6774e-03],\n",
       "           [ 5.5847e-03, -1.7212e-02, -4.9805e-02,  ..., -7.6660e-02,\n",
       "            -2.7466e-02,  4.8340e-02],\n",
       "           [-9.3750e-02, -3.6377e-02, -6.6895e-02,  ..., -3.0029e-02,\n",
       "            -5.8350e-02, -6.1768e-02]]]]], device='cuda:0')"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "Parameter containing:\n",
    "tensor([[[[[-4.7913e-03, -2.1515e-03, -2.0447e-03,  ...,  3.2997e-04,\n",
    "            -3.0212e-03, -7.9727e-04],\n",
    "           [ 9.7656e-04,  2.4567e-03,  9.8419e-04,  ..., -1.8845e-03,\n",
    "             2.3193e-03,  3.6621e-03],\n",
    "           [-3.5095e-04,  2.5940e-03, -2.7618e-03,  ..., -3.7956e-04,\n",
    "            -3.1948e-05,  1.7166e-03],\n",
    "           ...,\n",
    "           [-3.8605e-03, -1.1215e-03, -9.0790e-04,  ...,  6.5994e-04,\n",
    "             1.0071e-03,  1.2894e-03],\n",
    "           [-2.2278e-03,  1.2589e-03, -1.0204e-04,  ...,  3.7079e-03,\n",
    "             1.5354e-04, -8.3160e-04],\n",
    "           [ 6.3324e-04,  1.4114e-03,  9.5367e-04,  ..., -3.4485e-03,\n",
    "            -1.8234e-03, -4.0283e-03]]],\n",
    "\"\"\"\n",
    "intern_model.vision_encoder.patch_embed.proj.weight\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Parameter containing:\n",
       "tensor([[ 0.0366,  0.0135,  0.0492,  ..., -0.0274,  0.0493,  0.0242],\n",
       "        [-0.0113,  0.0504,  0.0469,  ..., -0.0269, -0.0224, -0.0305],\n",
       "        [ 0.0192, -0.0152,  0.0119,  ...,  0.0115, -0.0182, -0.0063],\n",
       "        ...,\n",
       "        [-0.0370, -0.0460,  0.0203,  ...,  0.0157, -0.0529,  0.0139],\n",
       "        [-0.0523, -0.0192, -0.0612,  ..., -0.0515,  0.0169,  0.0098],\n",
       "        [ 0.0277, -0.0029, -0.0349,  ...,  0.0014, -0.0453,  0.0052]],\n",
       "       device='cuda:0')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "Parameter containing:\n",
    "tensor([[ 0.0366,  0.0135,  0.0492,  ..., -0.0274,  0.0493,  0.0242],\n",
    "        [-0.0113,  0.0504,  0.0469,  ..., -0.0269, -0.0224, -0.0305],\n",
    "        [ 0.0192, -0.0152,  0.0119,  ...,  0.0115, -0.0182, -0.0063],\n",
    "        ...,\n",
    "        [-0.0370, -0.0460,  0.0203,  ...,  0.0157, -0.0529,  0.0139],\n",
    "        [-0.0523, -0.0192, -0.0612,  ..., -0.0515,  0.0169,  0.0098],\n",
    "        [ 0.0277, -0.0029, -0.0349,  ...,  0.0014, -0.0453,  0.0052]],\n",
    "       device='cuda:0')\n",
    "\"\"\"\n",
    "intern_model.text_encoder.encoder.layer[0].output.dense.weight"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Saved console output (same format as the retrieve_text cell), kept for reference; not executable code.\n",
    "# Video tensor([0.0005], device='cuda:0')\n",
    "# Text tensor([-1.1985e-03,  5.7084e-04, -7.3242e-05, -2.1923e-04,  1.3280e-03,\n",
    "#          6.7617e-05, -5.6482e-04,  1.3007e-03,  9.1326e-04,  5.7684e-04],\n",
    "#        device='cuda:0')\n",
    "# text: A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees. ~ prob: 0.5572\n",
    "# text: A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys. ~ prob: 0.1044\n",
    "# text: A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon. ~ prob: 0.0958\n",
    "# text: A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride. ~ prob: 0.0936\n",
    "# text: A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run. ~ prob: 0.0404"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Saved console output (same format as the retrieve_text cell), kept for reference; not executable code.\n",
    "# /home/toolkit/.conda/envs/urlb_test/lib/python3.8/site-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
    "#   warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n",
    "# Video tensor([-0.0014], device='cuda:0')\n",
    "# Text tensor([-1.8553e-03, -1.8098e-03,  5.9901e-04, -1.9457e-03,  4.7702e-05,\n",
    "#         -2.8283e-03, -2.2676e-03,  7.7966e-04, -2.1556e-04, -3.8074e-04],\n",
    "#        device='cuda:0')\n",
    "# text: A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride. ~ prob: 0.3186\n",
    "# text: A playful dog slides down a snowy hill, wagging its tail with delight. ~ prob: 0.1871\n",
    "# text: A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner. ~ prob: 0.1405\n",
    "# text: A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys. ~ prob: 0.1344\n",
    "# text: A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run. ~ prob: 0.0955"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}