amaye15 committed on
Commit
51a37a6
·
verified ·
1 Parent(s): dd4839e

Upload AIMv2ForImageClassification

Browse files
Files changed (4) hide show
  1. config.json +27 -0
  2. configuration_aimv2.py +60 -0
  3. model.safetensors +3 -0
  4. modeling_aimv2.py +308 -0
config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "apple/aimv2-large-patch14-native",
3
+ "architectures": [
4
+ "AIMv2ForImageClassification"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "apple/aimv2-large-patch14-native--configuration_aimv2.AIMv2Config",
9
+ "AutoModel": "apple/aimv2-large-patch14-native--modeling_aimv2.AIMv2Model",
10
+ "AutoModelForImageClassification": "modeling_aimv2.AIMv2ForImageClassification",
11
+ "FlaxAutoModel": "apple/aimv2-large-patch14-native--modeling_flax_aimv2.FlaxAIMv2Model"
12
+ },
13
+ "hidden_size": 1024,
14
+ "intermediate_size": 2816,
15
+ "model_type": "aimv2",
16
+ "num_attention_heads": 8,
17
+ "num_channels": 3,
18
+ "num_hidden_layers": 24,
19
+ "num_queries": 256,
20
+ "patch_size": 14,
21
+ "projection_dropout": 0.0,
22
+ "qkv_bias": false,
23
+ "rms_norm_eps": 1e-05,
24
+ "torch_dtype": "float32",
25
+ "transformers_version": "4.46.3",
26
+ "use_bias": false
27
+ }
configuration_aimv2.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any
2
+
3
+ from transformers.configuration_utils import PretrainedConfig
4
+
5
+ __all__ = ["AIMv2Config"]
6
+
7
+
8
class AIMv2Config(PretrainedConfig):
    """Configuration container for an [`AIMv2Model`].

    Constructing it with no arguments gives a configuration similar to
    [apple/aimv2-large-patch14-native](https://huggingface.co/apple/aimv2-large-patch14-native).

    Args:
        hidden_size: Dimension of the hidden representations.
        intermediate_size: Dimension of the SwiGLU representations.
        num_hidden_layers: Number of hidden layers in the Transformer.
        num_attention_heads: Number of attention heads for each attention
            layer in the Transformer.
        num_channels: Number of input channels.
        num_queries: Number of learnable queries in the head.
        patch_size: Patch size.
        rms_norm_eps: Epsilon value used for the RMS normalization layer.
        attention_dropout: Dropout ratio for attention probabilities.
        projection_dropout: Dropout ratio for the projection layer after
            the attention.
        qkv_bias: Whether to add a bias to the queries, keys and values.
        use_bias: Whether to add a bias in the feed-forward and projection
            layers.
        kwargs: Keyword arguments for the [`PretrainedConfig`].
    """

    model_type: str = "aimv2"

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 2816,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 8,
        num_channels: int = 3,
        num_queries: int = 256,
        patch_size: int = 14,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        projection_dropout: float = 0.0,
        qkv_bias: bool = False,
        use_bias: bool = False,
        **kwargs: Any,
    ):
        # The base class consumes the generic options first; the
        # model-specific fields are stored afterwards.
        super().__init__(**kwargs)

        # Transformer trunk geometry.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Input / patching / head sizes.
        self.num_channels = num_channels
        self.num_queries = num_queries
        self.patch_size = patch_size

        # Regularisation and normalisation.
        self.attention_dropout = attention_dropout
        self.rms_norm_eps = rms_norm_eps
        self.projection_dropout = projection_dropout

        # Bias switches for the linear layers.
        self.qkv_bias = qkv_bias
        self.use_bias = use_bias
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:310c1a3ac285e0284e06f0df0b9e3e69fbdafb4d5724471d27c416b73bf41779
3
+ size 1235770128
modeling_aimv2.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Tuple, Union
2
+
3
+ import torch
4
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
5
+ from .configuration_aimv2 import AIMv2Config
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+ from transformers.modeling_outputs import (
9
+ BaseModelOutputWithNoAttention,
10
+ ImageClassifierOutput,
11
+ )
12
+ from transformers.modeling_utils import PreTrainedModel
13
+
14
# Public API of this module: the bare backbone and the classification wrapper
# defined further down in this file.
__all__ = ["AIMv2Model", "AIMv2ForImageClassification"]
15
+
16
+
17
+ def _get_1d_sincos_pos_embed_from_grid(
18
+ embed_dim: int, pos: torch.Tensor
19
+ ) -> torch.Tensor:
20
+ omega = torch.arange(embed_dim // 2).float()
21
+ omega /= embed_dim / 2.0
22
+ omega = 1.0 / 10000**omega # (D / 2,)
23
+ pos = pos.reshape(-1) # (M,)
24
+ out = pos[:, None] * omega[None, :] # (M, D / 2), outer product
25
+ emb_sin, emb_cos = torch.sin(out), torch.cos(out) # (M, D / 2)
26
+ emb = torch.concatenate([emb_sin, emb_cos], dim=1) # (M, D)
27
+ return emb
28
+
29
+
30
def get_sincos_pos_embed(h: int, w: int, embed_dim: int) -> torch.Tensor:
    """Build a fixed 2-D sine/cosine positional embedding for an h x w grid.

    Half of the channels encode one grid coordinate and the other half the
    second coordinate; each half is produced by
    ``_get_1d_sincos_pos_embed_from_grid``.

    Args:
        h: Number of grid rows.
        w: Number of grid columns.
        embed_dim: Total embedding width; must be even.

    Returns:
        Tensor with ``h * w`` rows. The width equals ``embed_dim`` when
        ``embed_dim`` is a multiple of 4.
    """
    assert embed_dim % 2 == 0, embed_dim
    rows = torch.arange(h).float()
    cols = torch.arange(w).float()
    # With "xy" indexing both outputs have shape (h, w): the first holds the
    # column index of every cell, the second the row index.
    col_map, row_map = torch.meshgrid(cols, rows, indexing="xy")
    coords = torch.stack((col_map, row_map), dim=0).reshape([2, 1, h, w])
    half = embed_dim // 2
    first_half = _get_1d_sincos_pos_embed_from_grid(half, coords[0])
    second_half = _get_1d_sincos_pos_embed_from_grid(half, coords[1])
    return torch.concatenate([first_half, second_half], dim=1)  # (H * W, D)
41
+
42
+
43
class RMSNorm(nn.Module):
    """Root-mean-square normalisation over the last dimension with a learnable gain.

    Args:
        dim: Size of the last dimension; also the length of ``weight``.
        eps: Constant added to the mean square before the reciprocal
            square root.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps

    def _norm(self, x: torch.Tensor) -> torch.Tensor:
        """Divide ``x`` by the RMS of its last dimension."""
        mean_square = x.pow(2).mean(-1, keepdim=True)
        return x * torch.rsqrt(mean_square + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalise ``x`` and scale it by ``weight``."""
        # Statistics are computed on a float32 copy; the result is cast back
        # to the input dtype before the gain is applied.
        normalised = self._norm(x.float())
        return normalised.type_as(x) * self.weight

    def extra_repr(self) -> str:
        return f"{tuple(self.weight.shape)}, eps={self.eps}"
58
+
59
+
60
class AIMv2SwiGLUFFN(nn.Module):
    """Gated feed-forward layer computing ``fc2(silu(fc1(x)) * fc3(x))``.

    Layer sizes come from ``config.hidden_size`` and
    ``config.intermediate_size``; ``config.use_bias`` toggles the bias of
    all three linear layers.
    """

    def __init__(self, config: AIMv2Config):
        super().__init__()
        width, inner = config.hidden_size, config.intermediate_size
        with_bias = config.use_bias

        # Creation order fc1 -> fc2 -> fc3 is kept as in the original layout.
        self.fc1 = nn.Linear(width, inner, bias=with_bias)
        self.fc2 = nn.Linear(inner, width, bias=with_bias)
        self.fc3 = nn.Linear(width, inner, bias=with_bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the gated projection and map back to the input width."""
        gate = F.silu(self.fc1(x))
        value = self.fc3(x)
        return self.fc2(gate * value)
75
+
76
+
77
class AIMv2PatchEmbed(nn.Module):
    """Split an image into non-overlapping patches and embed each one.

    A strided convolution (kernel size == stride == ``config.patch_size``)
    projects every patch to ``config.hidden_size`` channels; the resulting
    tokens are RMS-normalised.
    """

    def __init__(self, config: AIMv2Config):
        super().__init__()
        patch = (config.patch_size, config.patch_size)
        self.proj = nn.Conv2d(
            config.num_channels,
            config.hidden_size,
            kernel_size=patch,
            stride=patch,
        )
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return normalised patch tokens for the image batch ``x``."""
        feature_map = self.proj(x)
        # Collapse the spatial dims and move channels last: one row per patch.
        tokens = feature_map.flatten(2).transpose(1, 2)
        return self.norm(tokens)
92
+
93
+
94
class AIMv2ViTPreprocessor(nn.Module):
    """Turn an image batch into position-aware patch tokens.

    The image is patchified by ``AIMv2PatchEmbed`` and a fixed 2-D
    sine/cosine positional embedding, sized from the input resolution, is
    added to the tokens.
    """

    def __init__(self, config: AIMv2Config):
        super().__init__()
        self.patch_h = config.patch_size
        self.patch_w = config.patch_size
        self.embed_dim = config.hidden_size

        self.patchifier = AIMv2PatchEmbed(config)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Embed ``x`` and add positional information.

        Args:
            x: Image batch; unpacked as ``(batch, channels, H, W)``.

        Returns:
            Patch tokens with the positional embedding added, on the same
            device and in the same dtype as the patchifier output.
        """
        _, _, H, W = x.shape
        tokens = self.patchifier(x)
        pos_embed = get_sincos_pos_embed(
            H // self.patch_h, W // self.patch_w, embed_dim=self.embed_dim
        )
        # The embedding is always built as a CPU float32 tensor. Without this
        # move/cast the addition fails when the model lives on an accelerator
        # and upcasts the tokens when the model runs in half precision.
        pos_embed = pos_embed.to(device=tokens.device, dtype=tokens.dtype)
        return tokens + pos_embed
111
+
112
+
113
class AIMv2Attention(nn.Module):
    """Multi-head self-attention built on ``F.scaled_dot_product_attention``.

    A single linear layer produces queries, keys and values; the attended
    values are merged back to ``hidden_size`` channels, projected and passed
    through the projection dropout.
    """

    def __init__(self, config: "AIMv2Config"):
        super().__init__()
        dim = config.hidden_size

        self.num_heads = config.num_attention_heads
        self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
        # Kept as a module so the configured rate stays visible and stored in
        # one place; its probability is handed to the fused attention call.
        self.attn_drop = nn.Dropout(config.attention_dropout)
        self.proj = nn.Linear(dim, dim, bias=config.use_bias)
        self.proj_drop = nn.Dropout(config.projection_dropout)

    def forward(
        self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Apply self-attention to ``x``.

        Args:
            x: Token tensor; unpacked as ``(batch, tokens, channels)``.
            mask: Optional attention mask forwarded unchanged as
                ``attn_mask``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        B, N, C = x.shape
        qkv = (
            self.qkv(x)
            .reshape(B, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = qkv.unbind(0)  # each (B, heads, N, C // heads)

        # Attention dropout was configured but never applied. It is only
        # active in training mode, so inference results are unchanged.
        dropout_p = self.attn_drop.p if self.training else 0.0
        x = F.scaled_dot_product_attention(
            q, k, v, attn_mask=mask, dropout_p=dropout_p
        )
        x = x.transpose(1, 2).contiguous().reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
140
+
141
+
142
class AIMv2Block(nn.Module):
    """Pre-norm Transformer block: attention and SwiGLU MLP, each with a residual."""

    def __init__(self, config: AIMv2Config):
        super().__init__()
        self.attn = AIMv2Attention(config)
        self.norm_1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.mlp = AIMv2SwiGLUFFN(config)
        self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Run one block; ``mask`` is passed through to the attention layer."""
        attended = self.attn(self.norm_1(x), mask)
        x = x + attended
        transformed = self.mlp(self.norm_2(x))
        return x + transformed
156
+
157
+
158
class AIMv2Transformer(nn.Module):
    """Stack of ``config.num_hidden_layers`` blocks followed by a final RMSNorm."""

    def __init__(self, config: AIMv2Config):
        super().__init__()
        self.blocks = nn.ModuleList(
            [AIMv2Block(config) for _ in range(config.num_hidden_layers)]
        )
        self.post_trunk_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        tokens: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        output_hidden_states: bool = False,
    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, ...]]]:
        """Run all blocks and the final normalisation.

        Args:
            tokens: Input tokens.
            mask: Optional attention mask handed to every block.
            output_hidden_states: When truthy, also return the output of
                every block (taken before the final normalisation).

        Returns:
            ``(normalised_tokens, hidden_states)`` where ``hidden_states`` is
            a tuple with one entry per block, or ``None`` when not requested.
        """
        per_block = []
        for block in self.blocks:
            tokens = block(tokens, mask)
            if output_hidden_states:
                per_block.append(tokens)

        hidden_states = tuple(per_block) if output_hidden_states else None
        return self.post_trunk_norm(tokens), hidden_states
179
+
180
+
181
class AIMv2PretrainedModel(PreTrainedModel):
    """Common base for the AIMv2 models; holds the class-level settings read by `PreTrainedModel`."""

    # Configuration class paired with these models.
    config_class = AIMv2Config
    # Attribute name under which wrapper models store the backbone
    # (``AIMv2ForImageClassification`` assigns it as ``self.aimv2``).
    base_model_prefix = "aimv2"
    # Name of the primary forward() argument.
    main_input_name = "pixel_values"
    # NOTE(review): presumably signals to transformers that the attention
    # layers use scaled_dot_product_attention — confirm against the library.
    _supports_sdpa = True
186
+
187
+
188
class AIMv2Model(AIMv2PretrainedModel):
    """Bare AIMv2 backbone: patch preprocessor followed by the Transformer trunk."""

    def __init__(self, config: AIMv2Config):
        super().__init__(config)
        self.preprocessor = AIMv2ViTPreprocessor(config)
        self.trunk = AIMv2Transformer(config)

    def forward(
        self,
        pixel_values: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[
        Tuple[torch.Tensor],
        Tuple[torch.Tensor, Tuple[torch.Tensor, ...]],
        BaseModelOutputWithNoAttention,
    ]:
        """Encode an image batch into token features.

        Args:
            pixel_values: Image batch fed to the preprocessor.
            mask: Optional attention mask handed to the trunk.
            output_hidden_states: Whether to return per-block outputs;
                falls back to ``config.output_hidden_states`` when ``None``.
            return_dict: Whether to return a
                ``BaseModelOutputWithNoAttention``; falls back to
                ``config.use_return_dict`` when ``None``.

        Returns:
            Either the output dataclass, or a tuple ``(last_hidden_state,)``
            extended with ``hidden_states`` when they were requested.
        """
        want_hidden = (
            self.config.output_hidden_states
            if output_hidden_states is None
            else output_hidden_states
        )
        as_dict = self.config.use_return_dict if return_dict is None else return_dict

        tokens = self.preprocessor(pixel_values)
        last_hidden, hidden_states = self.trunk(
            tokens, mask, output_hidden_states=want_hidden
        )

        if as_dict:
            return BaseModelOutputWithNoAttention(
                last_hidden_state=last_hidden,
                hidden_states=hidden_states,
            )

        return (last_hidden, hidden_states) if want_hidden else (last_hidden,)
224
+
225
+
226
class AIMv2ForImageClassification(AIMv2PretrainedModel):
    """AIMv2 backbone with a linear head applied to the first output token.

    The head is ``nn.Linear(hidden_size, num_labels)`` when
    ``config.num_labels > 0`` and ``nn.Identity()`` otherwise.
    """

    def __init__(self, config: AIMv2Config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.aimv2 = AIMv2Model(config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.hidden_size, config.num_labels)
            if config.num_labels > 0
            else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        pixel_values (`torch.Tensor`):
            Image batch. Required even though the argument defaults to `None`.
        head_mask (`torch.Tensor`, *optional*):
            Forwarded to the backbone as its attention `mask`; despite the name it is not a per-head mask.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Raises:
            ValueError: If `pixel_values` is `None`.
        """
        # Fail with a clear message instead of an unpacking error deep inside
        # the preprocessor.
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.aimv2(
            pixel_values,
            mask=head_mask,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        # NOTE(review): the head reads token 0. The preprocessor in this file
        # adds no class token, so this is the first patch token — confirm this
        # pooling is intended before changing it; the stored weights depend on it.
        logits = self.classifier(sequence_output[:, 0, :])

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                # Infer the problem type once and cache it on the config.
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        # The backbone exposes no attention maps, so only hidden states are
        # forwarded.
        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )