WEBing committed
Commit: cc4b78f
Parent: 7d6a6dd

update example

Files changed (2):
  1. README.md +8 -9
  2. vision_tower_builder.py +2 -12
README.md CHANGED
@@ -8,7 +8,7 @@ pipeline_tag: visual-question-answering
 # Kangaroo: A Powerful Video-Language Model Supporting Long-context Video Input
 
 ## Release
-- [2024/07/17] 🔥 **Kangaroo** has been released. We release [blog](https://kangaroogroup.github.io/Kangaroo.github.io/) and [model](https://huggingface.co/KangarooGroup/kangaroo). Please check out the blog for details.
+- [2024/07/17] 🔥 **Kangaroo** has been released. Please check out our [blog](https://kangaroogroup.github.io/Kangaroo.github.io/) and [github](https://github.com/KangarooGroup/Kangaroo) for details.
 
 ## Abstract
 We introduce <strong>Kangaroo</strong>, a powerful Multimodal Large Language Model designed for long-context video understanding. Our presented Kangaroo model shows remarkable performance across diverse video understanding tasks including video caption, QA and conversation. Generally, our key contributions in this work can be summarized as follows:
@@ -35,18 +35,18 @@ We introduce <strong>Kangaroo</strong>, a powerful Multimodal Large Language Mod
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
-tokenizer = AutoTokenizer.from_pretrained("KangarooGroup/kangaroo")
+tokenizer = AutoTokenizer.from_pretrained(".")
 model = AutoModelForCausalLM.from_pretrained(
-    "KangarooGroup/kangaroo",
+    ".",
     torch_dtype=torch.bfloat16,
     trust_remote_code=True,
 )
 model = model.to("cuda")
 terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
+video_path = "/path/to/video"
 
 # Round 1
-video_path = "path/to/video"
-query = "Please describe this video"
+query = "Please input your first question."
 out, history = model.chat(video_path=video_path,
                           query=query,
                           tokenizer=tokenizer,
@@ -55,20 +55,19 @@ out, history = model.chat(video_path=video_path,
                           do_sample=True,
                           temperature=0.6,
                           top_p=0.9,)
-print('Assistant: ', out)
+print('Assistant: \n', out)
 
 # Round 2
-query = "What happend at the end of the video?"
+query = "Please input your second question."
 out, history = model.chat(video_path=video_path,
                           query=query,
-                          history=history,
                           tokenizer=tokenizer,
                           max_new_tokens=512,
                           eos_token_id=terminators,
                           do_sample=True,
                           temperature=0.6,
                           top_p=0.9,)
-print('Assistant: ', out)
+print('Assistant: \n', out)
 ```
 
 ## Citation
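Note that after this commit both `from_pretrained` calls point at ".", so the example assumes it is run from inside a local clone of this model repository rather than resolving the "KangarooGroup/kangaroo" Hub id. A minimal sketch of the two loading options; the explanatory comments are an editorial gloss, and everything else mirrors the README snippet:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Either load from a local clone of this repo (what the updated example assumes;
# "." must contain config.json, the weights, and the remote-code model files) ...
model_id = "."
# ... or load directly from the Hub, as the example did before this commit:
# model_id = "KangarooGroup/kangaroo"

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # bf16 weights; assumes a CUDA GPU with bfloat16 support
    trust_remote_code=True,      # executes the modeling code bundled with the repo
).to("cuda")

# Stop generation at either the regular EOS token or the Llama-3-style turn
# terminator "<|eot_id|>", matching the README's `terminators` list.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
```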
vision_tower_builder.py CHANGED
@@ -562,24 +562,15 @@ class EVAVisionTransformer(nn.Module):
             init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None,
             xattn=xattn, rope=self.rope, postnorm=postnorm, subln=subln, naiveswiglu=naiveswiglu)
             for i in range(depth)])
-        #self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
-        #self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
-        #self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
 
         if self.pos_embed is not None:
             trunc_normal_(self.pos_embed, std=.02)
 
         trunc_normal_(self.cls_token, std=.02)
-        # trunc_normal_(self.mask_token, std=.02)
 
         self.apply(self._init_weights)
         self.fix_init_weight()
 
-        #if isinstance(self.head, nn.Linear):
-        #    trunc_normal_(self.head.weight, std=.02)
-        #    self.head.weight.data.mul_(init_scale)
-        #    self.head.bias.data.mul_(init_scale)
-
         # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
         self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()
@@ -736,7 +727,7 @@ def build_vision_tower(
         img_size = vision_cfg.image_size,
         patch_size = vision_cfg.patch_size,
         num_classes = model_cfg['embed_dim'],
-        use_mean_pooling = vision_cfg.global_average_pool, #False
+        use_mean_pooling = vision_cfg.global_average_pool,
         init_values = vision_cfg.ls_init_value,
         patch_dropout = vision_cfg.patch_dropout,
         embed_dim = vision_cfg.width,
@@ -749,7 +740,7 @@
         xattn = vision_cfg.xattn,
         rope = vision_cfg.rope,
         postnorm = vision_cfg.postnorm,
-        pt_hw_seq_len = vision_cfg.pt_hw_seq_len, # 224/14
+        pt_hw_seq_len = vision_cfg.pt_hw_seq_len,
         intp_freq = vision_cfg.intp_freq,
         naiveswiglu = vision_cfg.naiveswiglu,
         subln = vision_cfg.subln
@@ -761,7 +752,6 @@
 
     vision_tower.to(device=device)
 
-    # set image / mean metadata from pretrained_cfg if available, or use default
     vision_tower.image_mean = (0.48145466, 0.4578275, 0.40821073)
     vision_tower.image_std = (0.26862954, 0.26130258, 0.27577711)
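The `image_mean` / `image_std` tuples hard-coded above are the standard CLIP normalization statistics. A minimal sketch of how a preprocessing pipeline would typically consume them (torchvision-based; the 224-pixel input size is an assumption taken from the removed `# 224/14` comment, and this transform is not code from this repo):

```python
from torchvision import transforms

# CLIP RGB statistics, matching the values set on the vision tower above
CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
CLIP_STD = (0.26862954, 0.26130258, 0.27577711)

preprocess = transforms.Compose([
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),      # assumed square input of vision_cfg.image_size
    transforms.ToTensor(),           # HWC uint8 in [0, 255] -> CHW float in [0, 1]
    transforms.Normalize(mean=CLIP_MEAN, std=CLIP_STD),
])
```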