michaelryoo committed · verified
Commit 05ce84e · 1 Parent(s): cfabfd0

Upload model

Files changed (4):
  1. README.md +1 -1
  2. config.json +3 -3
  3. generation_config.json +1 -1
  4. modeling_xgenmm.py +9 -4
README.md CHANGED
@@ -1,7 +1,7 @@
 ---
-license: cc-by-nc-4.0
 language:
 - en
+license: cc-by-nc-4.0
 pipeline_tag: image-text-to-text
 ---

config.json CHANGED
@@ -14,14 +14,14 @@
         "torch_dtype": "bfloat16"
     },
     "torch_dtype": "float32",
-    "transformers_version": "4.47.0",
+    "transformers_version": "4.41.2",
     "vision_encoder_config": {
         "anyres_patch_sampling": false,
         "image_aspect_ratio": "pad",
-        "model_type": "xgenmm_vision_encoder"
+        "model_type": "xgenmm_vision_encoder",
+        "temporal_encoder_mode": "gttm"
     },
     "vision_tokenizer_config": {
-        "_attn_implementation_autoset": true,
         "model_type": "xgenmm_vision_tokenizer"
     }
 }
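
The new `temporal_encoder_mode` key sits under `vision_encoder_config` and can be inspected straight from the raw `config.json`, without instantiating the model. A minimal sketch, assuming `huggingface_hub` is installed; the repo id below is a placeholder for wherever this checkpoint is hosted:

```python
import json

from huggingface_hub import hf_hub_download

# Placeholder repo id; substitute the actual model id for this checkpoint.
cfg_path = hf_hub_download(repo_id="<this-model-repo>", filename="config.json")
with open(cfg_path) as f:
    cfg = json.load(f)

# Values written by this commit.
print(cfg["transformers_version"])                            # 4.41.2
print(cfg["vision_encoder_config"]["temporal_encoder_mode"])  # gttm
```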
generation_config.json CHANGED
@@ -3,5 +3,5 @@
     "bos_token_id": 1,
     "eos_token_id": 32000,
     "pad_token_id": 32000,
-    "transformers_version": "4.47.0"
+    "transformers_version": "4.41.2"
 }
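
Both config.json and generation_config.json now record transformers 4.41.2 rather than 4.47.0. A small optional check (not part of the repo) that the local install matches the version the checkpoint was saved with:

```python
import transformers

RECORDED_VERSION = "4.41.2"  # value written into generation_config.json by this commit

if transformers.__version__ != RECORDED_VERSION:
    print(
        f"Installed transformers {transformers.__version__}; "
        f"this checkpoint was saved with {RECORDED_VERSION}."
    )
```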
modeling_xgenmm.py CHANGED
@@ -52,12 +52,14 @@ class XGenMMVisionTokenizerConfig(PretrainedConfig):
         lang_embedding_dim: int = 3072,
         num_vis_tokens: int = 128,
         image_aspect_ratio: str = "anyres",
+        temporal_encoder_mode: str = 'gttm',
         **kwargs,
     ):
         self.vis_feature_dim = vis_feature_dim
         self.lang_embedding_dim = lang_embedding_dim
         self.num_vis_tokens = num_vis_tokens
         self.image_aspect_ratio = image_aspect_ratio
+        self.temporal_encoder_mode = temporal_encoder_mode
         super().__init__(**kwargs)


@@ -76,6 +78,7 @@ class XGenMMConfig(PretrainedConfig):
             vision_encoder_config = {
                 "image_aspect_ratio": "pad",
                 "anyres_patch_sampling": False,
+                "temporal_encoder_mode": "gttm",
             }
             logger.info(
                 "vision_encoder_config is None. initializing the XGenMMVisionEncoderConfig with default values."
@@ -1034,7 +1037,7 @@ class PerceiverResampler(VisionTokenizer):
         max_num_media=None,
         max_num_frames=None,
         ff_mult=4,
-        video_mode='gttm',
+        temporal_encoder_mode='gttm',
     ):
         """
         Perceiver module which takes in image features and outputs image tokens.
@@ -1087,11 +1090,12 @@

         self.norm = nn.LayerNorm(dim)

-        self.video_mode = video_mode
-        if self.video_mode=='gttm':
+        self.temporal_encoder_mode = temporal_encoder_mode
+        if self.temporal_encoder_mode=='gttm':
             # self.ttm = TokenTuringMachine(dim=dim, memory_size=128, memory_out_mode=True)
             self.temporal_encoder = GroupedTokenTuringMachine(dim=dim, process_size=128, memory_size_per_group=4)
-            # self.temporal_encoder = GroupedTokenTuringMachine4(dim=dim, process_size=128, memory_size_per_group=4, output_size=32)
+        elif self.temporal_encoder_mode=='gttm_pool':
+            self.temporal_encoder = GroupedTokenTuringMachine4(dim=dim, process_size=128, memory_size_per_group=4, output_size=32)

     def forward(self, x, vision_attn_masks):
         """
@@ -2433,6 +2437,7 @@ class XGenMMVisionTokenizer(PreTrainedModel):
             dim_inner=config.lang_embedding_dim,
             # TODO: hardwiring for now...
             num_latents=128,
+            temporal_encoder_mode=config.temporal_encoder_mode,
         )

     def forward(self, vision_features: torch.Tensor, vision_attn_masks: torch.Tensor):
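
Taken together, the modeling changes thread one setting, `temporal_encoder_mode`, from `XGenMMVisionTokenizerConfig` through the resampler construction in `XGenMMVisionTokenizer` into `PerceiverResampler`, where it selects the temporal encoder ('gttm' keeps `GroupedTokenTuringMachine`, 'gttm_pool' switches to `GroupedTokenTuringMachine4`). The sketch below mirrors only that dispatch; `_StubGroupedTTM` is a hypothetical stand-in, since the real modules' internals are not part of this diff, and the tensor shapes are chosen purely for illustration:

```python
import torch
import torch.nn as nn


class _StubGroupedTTM(nn.Module):
    """Hypothetical stand-in for GroupedTokenTuringMachine / GroupedTokenTuringMachine4."""

    def __init__(self, dim, process_size, memory_size_per_group, output_size=None):
        super().__init__()
        self.proj = nn.Linear(dim, dim)
        self.output_size = output_size

    def forward(self, x):
        x = self.proj(x)
        if self.output_size is not None:
            # Purely illustrative: the stub truncates tokens when output_size is set.
            x = x[:, : self.output_size, :]
        return x


def build_temporal_encoder(dim, temporal_encoder_mode="gttm"):
    """Mirrors the branch added to PerceiverResampler.__init__ in this commit."""
    if temporal_encoder_mode == "gttm":
        return _StubGroupedTTM(dim, process_size=128, memory_size_per_group=4)
    elif temporal_encoder_mode == "gttm_pool":
        return _StubGroupedTTM(dim, process_size=128, memory_size_per_group=4, output_size=32)
    raise ValueError(f"unsupported temporal_encoder_mode: {temporal_encoder_mode}")


tokens = torch.randn(2, 128, 256)  # (batch, vision tokens, dim); demo shapes only
print(build_temporal_encoder(256, "gttm")(tokens).shape)       # torch.Size([2, 128, 256])
print(build_temporal_encoder(256, "gttm_pool")(tokens).shape)  # torch.Size([2, 32, 256])
```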