update example
- README.md +8 -9
- vision_tower_builder.py +2 -12
README.md
CHANGED
@@ -8,7 +8,7 @@ pipeline_tag: visual-question-answering
 # Kangaroo: A Powerful Video-Language Model Supporting Long-context Video Input
 
 ## Release
-- [2024/07/17] 🔥 **Kangaroo** has been released.
+- [2024/07/17] 🔥 **Kangaroo** has been released. Please check out our [blog](https://kangaroogroup.github.io/Kangaroo.github.io/) and [github](https://github.com/KangarooGroup/Kangaroo) for details.
 
 ## Abstract
 We introduce <strong>Kangaroo</strong>, a powerful Multimodal Large Language Model designed for long-context video understanding. Our presented Kangaroo model shows remarkable performance across diverse video understanding tasks including video caption, QA and conversation. Generally, our key contributions in this work can be summarized as follows:
@@ -35,18 +35,18 @@ We introduce <strong>Kangaroo</strong>, a powerful Multimodal Large Language Mod
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained("
+tokenizer = AutoTokenizer.from_pretrained(".")
 model = AutoModelForCausalLM.from_pretrained(
-    "
+    ".",
     torch_dtype=torch.bfloat16,
     trust_remote_code=True,
 )
 model = model.to("cuda")
 terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
+video_path = "/path/to/video"
 
 # Round 1
-
-query = "Please describe this video"
+query = "Please input your first question."
 out, history = model.chat(video_path=video_path,
                           query=query,
                           tokenizer=tokenizer,
@@ -55,20 +55,19 @@ out, history = model.chat(video_path=video_path,
                           do_sample=True,
                           temperature=0.6,
                           top_p=0.9,)
-print('
+print('Assitant: \n', out)
 
 # Round 2
-query = "
+query = "Please input your second question."
 out, history = model.chat(video_path=video_path,
                           query=query,
-                          history=history,
                           tokenizer=tokenizer,
                           max_new_tokens=512,
                           eos_token_id=terminators,
                           do_sample=True,
                           temperature=0.6,
                           top_p=0.9,)
-print('
+print('Assitant: \n', out)
 ```
 
 ## Citation
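For reference, here is the updated quick-start example from this diff assembled into one runnable snippet. It is a sketch under a few stated assumptions: the two generation kwargs hidden between the hunks for Round 1 (`max_new_tokens`, `eos_token_id`) are assumed to mirror Round 2, the Round 1 question reuses the old README's "Please describe this video", and the Round 2 question is an illustrative stand-in for the README's placeholder. The "." checkpoint path and "/path/to/video" are the placeholders used in the README; `model.chat` is the custom chat helper loaded via `trust_remote_code=True`.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# "." assumes the snippet is run from a local clone of this model repo;
# any local path (or Hub id) pointing at the Kangaroo checkpoint works the same way.
tokenizer = AutoTokenizer.from_pretrained(".")
model = AutoModelForCausalLM.from_pretrained(
    ".",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)
model = model.to("cuda")
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
video_path = "/path/to/video"  # placeholder: point this at a real video file

# Round 1
query = "Please describe this video"
out, history = model.chat(video_path=video_path,
                          query=query,
                          tokenizer=tokenizer,
                          max_new_tokens=512,        # assumed to match Round 2
                          eos_token_id=terminators,  # assumed to match Round 2
                          do_sample=True,
                          temperature=0.6,
                          top_p=0.9)
print('Assistant: \n', out)

# Round 2 (illustrative follow-up; note the updated example no longer
# passes history= to the second call)
query = "What happens at the end of the video?"
out, history = model.chat(video_path=video_path,
                          query=query,
                          tokenizer=tokenizer,
                          max_new_tokens=512,
                          eos_token_id=terminators,
                          do_sample=True,
                          temperature=0.6,
                          top_p=0.9)
print('Assistant: \n', out)
```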
vision_tower_builder.py
CHANGED
@@ -562,24 +562,15 @@ class EVAVisionTransformer(nn.Module):
                 init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None,
                 xattn=xattn, rope=self.rope, postnorm=postnorm, subln=subln, naiveswiglu=naiveswiglu)
             for i in range(depth)])
-        #self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
-        #self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
-        #self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
 
         if self.pos_embed is not None:
             trunc_normal_(self.pos_embed, std=.02)
 
         trunc_normal_(self.cls_token, std=.02)
-        # trunc_normal_(self.mask_token, std=.02)
 
         self.apply(self._init_weights)
         self.fix_init_weight()
 
-        #if isinstance(self.head, nn.Linear):
-        # trunc_normal_(self.head.weight, std=.02)
-        # self.head.weight.data.mul_(init_scale)
-        # self.head.bias.data.mul_(init_scale)
-
         # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
         self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()
 
@@ -736,7 +727,7 @@ def build_vision_tower(
        img_size = vision_cfg.image_size,
        patch_size = vision_cfg.patch_size,
        num_classes = model_cfg['embed_dim'],
-       use_mean_pooling = vision_cfg.global_average_pool,
+       use_mean_pooling = vision_cfg.global_average_pool,
        init_values = vision_cfg.ls_init_value,
        patch_dropout = vision_cfg.patch_dropout,
        embed_dim = vision_cfg.width,
@@ -749,7 +740,7 @@ def build_vision_tower(
        xattn = vision_cfg.xattn,
        rope = vision_cfg.rope,
        postnorm = vision_cfg.postnorm,
-       pt_hw_seq_len = vision_cfg.pt_hw_seq_len,
+       pt_hw_seq_len = vision_cfg.pt_hw_seq_len,
        intp_freq = vision_cfg.intp_freq,
        naiveswiglu = vision_cfg.naiveswiglu,
        subln = vision_cfg.subln
@@ -761,7 +752,6 @@ def build_vision_tower(
 
    vision_tower.to(device=device)
 
-   # set image / mean metadata from pretrained_cfg if available, or use default
    vision_tower.image_mean = (0.48145466, 0.4578275, 0.40821073)
    vision_tower.image_std = (0.26862954, 0.26130258, 0.27577711)
 
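The `image_mean` / `image_std` values hard-coded at the end of `build_vision_tower` are the standard CLIP normalization statistics. As a rough, hypothetical illustration of how such statistics are typically consumed (this is not code from the repo), a torchvision-based frame preprocessing step might look like:

```python
from torchvision import transforms

# CLIP normalization statistics, matching the values set on the vision tower
# in build_vision_tower above.
CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
CLIP_STD = (0.26862954, 0.26130258, 0.27577711)

# Hypothetical per-frame preprocessing; the resize target should come from
# vision_cfg.image_size, 224 here is only a placeholder.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),                      # PIL image -> CHW float tensor in [0, 1]
    transforms.Normalize(CLIP_MEAN, CLIP_STD),  # per-channel (x - mean) / std
])
```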