update example
- README.md +8 -9
- vision_tower_builder.py +2 -12
README.md
CHANGED
@@ -8,7 +8,7 @@ pipeline_tag: visual-question-answering
 # Kangaroo: A Powerful Video-Language Model Supporting Long-context Video Input
 
 ## Release
-- [2024/07/17] 🔥 **Kangaroo** has been released.
+- [2024/07/17] 🔥 **Kangaroo** has been released. Please check out our [blog](https://kangaroogroup.github.io/Kangaroo.github.io/) and [github](https://github.com/KangarooGroup/Kangaroo) for details.
 
 ## Abstract
 We introduce <strong>Kangaroo</strong>, a powerful Multimodal Large Language Model designed for long-context video understanding. Our presented Kangaroo model shows remarkable performance across diverse video understanding tasks including video caption, QA and conversation. Generally, our key contributions in this work can be summarized as follows:
@@ -35,18 +35,18 @@ We introduce <strong>Kangaroo</strong>, a powerful Multimodal Large Language Mod
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained("
+tokenizer = AutoTokenizer.from_pretrained(".")
 model = AutoModelForCausalLM.from_pretrained(
-    "
+    ".",
     torch_dtype=torch.bfloat16,
     trust_remote_code=True,
 )
 model = model.to("cuda")
 terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
+video_path = "/path/to/video"
 
 # Round 1
-
-query = "Please describe this video"
+query = "Please input your first question."
 out, history = model.chat(video_path=video_path,
                           query=query,
                           tokenizer=tokenizer,
@@ -55,20 +55,19 @@ out, history = model.chat(video_path=video_path,
                           do_sample=True,
                           temperature=0.6,
                           top_p=0.9,)
-print('
+print('Assitant: \n', out)
 
 # Round 2
-query = "
+query = "Please input your second question."
 out, history = model.chat(video_path=video_path,
                           query=query,
-                          history=history,
                           tokenizer=tokenizer,
                           max_new_tokens=512,
                           eos_token_id=terminators,
                           do_sample=True,
                           temperature=0.6,
                           top_p=0.9,)
-print('
+print('Assitant: \n', out)
 ```
 
 ## Citation
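For reference, here is the updated quick-start example from this diff assembled into one runnable snippet. It is a sketch under a few stated assumptions: the two generation kwargs hidden between the hunks for Round 1 (`max_new_tokens`, `eos_token_id`) are assumed to mirror Round 2, the Round 1 question reuses the old README's "Please describe this video", and the Round 2 question is an illustrative stand-in for the README's placeholder. The "." checkpoint path and "/path/to/video" are the placeholders used in the README; `model.chat` is the custom chat helper loaded via `trust_remote_code=True`.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# "." assumes the snippet is run from a local clone of this model repo;
# any local path (or Hub id) pointing at the Kangaroo checkpoint works the same way.
tokenizer = AutoTokenizer.from_pretrained(".")
model = AutoModelForCausalLM.from_pretrained(
    ".",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = model.to("cuda")
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
video_path = "/path/to/video"  # placeholder: point this at a real video file

# Round 1
query = "Please describe this video"
out, history = model.chat(video_path=video_path,
                          query=query,
                          tokenizer=tokenizer,
                          max_new_tokens=512,        # assumed to match Round 2
                          eos_token_id=terminators,  # assumed to match Round 2
                          do_sample=True,
                          temperature=0.6,
                          top_p=0.9)
print('Assistant: \n', out)

# Round 2 (illustrative follow-up; note the updated example no longer
# passes history= to the second call)
query = "What happens at the end of the video?"
out, history = model.chat(video_path=video_path,
                          query=query,
                          tokenizer=tokenizer,
                          max_new_tokens=512,
                          eos_token_id=terminators,
                          do_sample=True,
                          temperature=0.6,
                          top_p=0.9)
print('Assistant: \n', out)
```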
vision_tower_builder.py
CHANGED
@@ -562,24 +562,15 @@ class EVAVisionTransformer(nn.Module):
                 init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None,
                 xattn=xattn, rope=self.rope, postnorm=postnorm, subln=subln, naiveswiglu=naiveswiglu)
             for i in range(depth)])
-        #self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
-        #self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
-        #self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
 
         if self.pos_embed is not None:
             trunc_normal_(self.pos_embed, std=.02)
 
         trunc_normal_(self.cls_token, std=.02)
-        # trunc_normal_(self.mask_token, std=.02)
 
         self.apply(self._init_weights)
         self.fix_init_weight()
 
-        #if isinstance(self.head, nn.Linear):
-        # trunc_normal_(self.head.weight, std=.02)
-        # self.head.weight.data.mul_(init_scale)
-        # self.head.bias.data.mul_(init_scale)
-
         # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
         self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()
 
@@ -736,7 +727,7 @@ def build_vision_tower(
        img_size = vision_cfg.image_size,
        patch_size = vision_cfg.patch_size,
        num_classes = model_cfg['embed_dim'],
-       use_mean_pooling = vision_cfg.global_average_pool,
+       use_mean_pooling = vision_cfg.global_average_pool,
        init_values = vision_cfg.ls_init_value,
        patch_dropout = vision_cfg.patch_dropout,
        embed_dim = vision_cfg.width,
@@ -749,7 +740,7 @@ def build_vision_tower(
        xattn = vision_cfg.xattn,
        rope = vision_cfg.rope,
        postnorm = vision_cfg.postnorm,
-       pt_hw_seq_len = vision_cfg.pt_hw_seq_len,
+       pt_hw_seq_len = vision_cfg.pt_hw_seq_len,
        intp_freq = vision_cfg.intp_freq,
        naiveswiglu = vision_cfg.naiveswiglu,
        subln = vision_cfg.subln
@@ -761,7 +752,6 @@ def build_vision_tower(
 
    vision_tower.to(device=device)
 
-   # set image / mean metadata from pretrained_cfg if available, or use default
    vision_tower.image_mean = (0.48145466, 0.4578275, 0.40821073)
    vision_tower.image_std = (0.26862954, 0.26130258, 0.27577711)
 
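The `image_mean` / `image_std` values hard-coded at the end of `build_vision_tower` are the standard CLIP normalization statistics. As a rough, hypothetical illustration of how such statistics are typically consumed (this is not code from the repo), a torchvision-based frame preprocessing step might look like:

```python
from torchvision import transforms

# CLIP normalization statistics, matching the values set on the vision tower
# in build_vision_tower above.
CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
CLIP_STD = (0.26862954, 0.26130258, 0.27577711)

# Hypothetical per-frame preprocessing; the resize target should come from
# vision_cfg.image_size, 224 here is only a placeholder.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),                      # PIL image -> CHW float tensor in [0, 1]
    transforms.Normalize(CLIP_MEAN, CLIP_STD),  # per-channel (x - mean) / std
])
```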