michaelryoo
committed on
Upload model
Browse files- README.md +1 -1
- config.json +3 -3
- generation_config.json +1 -1
- modeling_xgenmm.py +9 -4
README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
---
|
2 |
-
license: cc-by-nc-4.0
|
3 |
language:
|
4 |
- en
|
|
|
5 |
pipeline_tag: image-text-to-text
|
6 |
---
|
7 |
|
|
|
1 |
---
|
|
|
2 |
language:
|
3 |
- en
|
4 |
+
license: cc-by-nc-4.0
|
5 |
pipeline_tag: image-text-to-text
|
6 |
---
|
7 |
|
config.json
CHANGED
@@ -14,14 +14,14 @@
|
|
14 |
"torch_dtype": "bfloat16"
|
15 |
},
|
16 |
"torch_dtype": "float32",
|
17 |
-
"transformers_version": "4.
|
18 |
"vision_encoder_config": {
|
19 |
"anyres_patch_sampling": false,
|
20 |
"image_aspect_ratio": "pad",
|
21 |
-
"model_type": "xgenmm_vision_encoder"
|
|
|
22 |
},
|
23 |
"vision_tokenizer_config": {
|
24 |
-
"_attn_implementation_autoset": true,
|
25 |
"model_type": "xgenmm_vision_tokenizer"
|
26 |
}
|
27 |
}
|
|
|
14 |
"torch_dtype": "bfloat16"
|
15 |
},
|
16 |
"torch_dtype": "float32",
|
17 |
+
"transformers_version": "4.41.2",
|
18 |
"vision_encoder_config": {
|
19 |
"anyres_patch_sampling": false,
|
20 |
"image_aspect_ratio": "pad",
|
21 |
+
"model_type": "xgenmm_vision_encoder",
|
22 |
+
"temporal_encoder_mode": "gttm"
|
23 |
},
|
24 |
"vision_tokenizer_config": {
|
|
|
25 |
"model_type": "xgenmm_vision_tokenizer"
|
26 |
}
|
27 |
}
|
generation_config.json
CHANGED
@@ -3,5 +3,5 @@
|
|
3 |
"bos_token_id": 1,
|
4 |
"eos_token_id": 32000,
|
5 |
"pad_token_id": 32000,
|
6 |
-
"transformers_version": "4.
|
7 |
}
|
|
|
3 |
"bos_token_id": 1,
|
4 |
"eos_token_id": 32000,
|
5 |
"pad_token_id": 32000,
|
6 |
+
"transformers_version": "4.41.2"
|
7 |
}
|
modeling_xgenmm.py
CHANGED
@@ -52,12 +52,14 @@ class XGenMMVisionTokenizerConfig(PretrainedConfig):
|
|
52 |
lang_embedding_dim: int = 3072,
|
53 |
num_vis_tokens: int = 128,
|
54 |
image_aspect_ratio: str = "anyres",
|
|
|
55 |
**kwargs,
|
56 |
):
|
57 |
self.vis_feature_dim = vis_feature_dim
|
58 |
self.lang_embedding_dim = lang_embedding_dim
|
59 |
self.num_vis_tokens = num_vis_tokens
|
60 |
self.image_aspect_ratio = image_aspect_ratio
|
|
|
61 |
super().__init__(**kwargs)
|
62 |
|
63 |
|
@@ -76,6 +78,7 @@ class XGenMMConfig(PretrainedConfig):
|
|
76 |
vision_encoder_config = {
|
77 |
"image_aspect_ratio": "pad",
|
78 |
"anyres_patch_sampling": False,
|
|
|
79 |
}
|
80 |
logger.info(
|
81 |
"vision_encoder_config is None. initializing the XGenMMVisionEncoderConfig with default values."
|
@@ -1034,7 +1037,7 @@ class PerceiverResampler(VisionTokenizer):
|
|
1034 |
max_num_media=None,
|
1035 |
max_num_frames=None,
|
1036 |
ff_mult=4,
|
1037 |
-
|
1038 |
):
|
1039 |
"""
|
1040 |
Perceiver module which takes in image features and outputs image tokens.
|
@@ -1087,11 +1090,12 @@ class PerceiverResampler(VisionTokenizer):
|
|
1087 |
|
1088 |
self.norm = nn.LayerNorm(dim)
|
1089 |
|
1090 |
-
self.
|
1091 |
-
if self.
|
1092 |
# self.ttm = TokenTuringMachine(dim=dim, memory_size=128, memory_out_mode=True)
|
1093 |
self.temporal_encoder = GroupedTokenTuringMachine(dim=dim, process_size=128, memory_size_per_group=4)
|
1094 |
-
|
|
|
1095 |
|
1096 |
def forward(self, x, vision_attn_masks):
|
1097 |
"""
|
@@ -2433,6 +2437,7 @@ class XGenMMVisionTokenizer(PreTrainedModel):
|
|
2433 |
dim_inner=config.lang_embedding_dim,
|
2434 |
# TODO: hardwiring for now...
|
2435 |
num_latents=128,
|
|
|
2436 |
)
|
2437 |
|
2438 |
def forward(self, vision_features: torch.Tensor, vision_attn_masks: torch.Tensor):
|
|
|
52 |
lang_embedding_dim: int = 3072,
|
53 |
num_vis_tokens: int = 128,
|
54 |
image_aspect_ratio: str = "anyres",
|
55 |
+
temporal_encoder_mode: str = 'gttm',
|
56 |
**kwargs,
|
57 |
):
|
58 |
self.vis_feature_dim = vis_feature_dim
|
59 |
self.lang_embedding_dim = lang_embedding_dim
|
60 |
self.num_vis_tokens = num_vis_tokens
|
61 |
self.image_aspect_ratio = image_aspect_ratio
|
62 |
+
self.temporal_encoder_mode = temporal_encoder_mode
|
63 |
super().__init__(**kwargs)
|
64 |
|
65 |
|
|
|
78 |
vision_encoder_config = {
|
79 |
"image_aspect_ratio": "pad",
|
80 |
"anyres_patch_sampling": False,
|
81 |
+
"temporal_encoder_mode": "gttm",
|
82 |
}
|
83 |
logger.info(
|
84 |
"vision_encoder_config is None. initializing the XGenMMVisionEncoderConfig with default values."
|
|
|
1037 |
max_num_media=None,
|
1038 |
max_num_frames=None,
|
1039 |
ff_mult=4,
|
1040 |
+
temporal_encoder_mode='gttm',
|
1041 |
):
|
1042 |
"""
|
1043 |
Perceiver module which takes in image features and outputs image tokens.
|
|
|
1090 |
|
1091 |
self.norm = nn.LayerNorm(dim)
|
1092 |
|
1093 |
+
self.temporal_encoder_mode = temporal_encoder_mode
|
1094 |
+
if self.temporal_encoder_mode=='gttm':
|
1095 |
# self.ttm = TokenTuringMachine(dim=dim, memory_size=128, memory_out_mode=True)
|
1096 |
self.temporal_encoder = GroupedTokenTuringMachine(dim=dim, process_size=128, memory_size_per_group=4)
|
1097 |
+
elif self.temporal_encoder_mode=='gttm_pool':
|
1098 |
+
self.temporal_encoder = GroupedTokenTuringMachine4(dim=dim, process_size=128, memory_size_per_group=4, output_size=32)
|
1099 |
|
1100 |
def forward(self, x, vision_attn_masks):
|
1101 |
"""
|
|
|
2437 |
dim_inner=config.lang_embedding_dim,
|
2438 |
# TODO: hardwiring for now...
|
2439 |
num_latents=128,
|
2440 |
+
temporal_encoder_mode=config.temporal_encoder_mode,
|
2441 |
)
|
2442 |
|
2443 |
def forward(self, vision_features: torch.Tensor, vision_attn_masks: torch.Tensor):
|