Kian Kenyon-Dean committed (unverified)
Commit 3d9eac1
2 Parent(s): 86fd276 4bf6407

Merge pull request #7 from recursionpharma/more-code

Files changed (7)
  1. README.md +10 -8
  2. config.yaml +15 -0
  3. loss.py +50 -0
  4. mae_modules.py +272 -0
  5. mae_utils.py +64 -0
  6. masking.py +46 -0
  7. vit.py +284 -0
README.md CHANGED
@@ -1,13 +1,17 @@
  # Masked Autoencoders are Scalable Learners of Cellular Morphology
- Official repo for Recursion's accepted spotlight paper at [NeurIPS 2023 Generative AI & Biology workshop](https://openreview.net/group?id=NeurIPS.cc/2023/Workshop/GenBio).
-
- Paper: https://arxiv.org/abs/2309.16064
+ Official repo for Recursion's two recently accepted papers:
+ - Spotlight full-length paper at [CVPR 2024](https://cvpr.thecvf.com/Conferences/2024/AcceptedPapers) -- Masked Autoencoders for Microscopy are Scalable Learners of Cellular Biology
+   - Paper: link to be shared soon!
+ - Spotlight workshop paper at [NeurIPS 2023 Generative AI & Biology workshop](https://openreview.net/group?id=NeurIPS.cc/2023/Workshop/GenBio)
+   - Paper: https://arxiv.org/abs/2309.16064
 
  ![vit_diff_mask_ratios](https://github.com/recursionpharma/maes_microscopy/assets/109550980/c15f46b1-cdb9-41a7-a4af-bdc9684a971d)
 
 
  ## Provided code
- The baseline Vision Transformer architecture backbone used in this work can be built with the following code snippet from Timm:
+ See the repo for the ingredients required to define our MAEs. Users seeking to re-implement training will need to stitch together the Encoder and Decoder modules according to their use case.
+
+ Furthermore, the baseline Vision Transformer architecture backbone used in this work can be built with the following code snippet from Timm:
  ```
  import timm.models.vision_transformer as vit
 
@@ -29,11 +33,9 @@ def vit_base_patch16_256(**kwargs):
      return vit.vit_base_patch16_224(**default_kwargs)
  ```
 
- Additional code will be released as the date of the workshop gets closer.
-
- **While we cannot share all the internal code we've written for training and evaluating these models, it would be very useful if interested persons could raise an Issue in this repo to let us know which aspects of the code would be most useful to the broader community.**
-
  ## Provided models
+ A publicly available model for research can be found via Nvidia's BioNemo platform, which handles inference and auto-scaling for you: https://www.rxrx.ai/phenom
+
  We have partnered with Nvidia to host a publicly-available smaller and more flexible version of the MAE phenomics foundation model, called Phenom-Beta. Interested parties can access it directly through the Nvidia BioNemo API:
  - https://blogs.nvidia.com/blog/drug-discovery-bionemo-generative-ai/
  - https://www.youtube.com/watch?v=Gch6bX1toB0
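
As a quick orientation for the backbone snippet referenced in the README hunk above, here is a hedged usage sketch (not part of this commit). It assumes a recent timm release (providing `ParallelScalingBlock` and `qk_norm`) and that the repo's `vit.py` is importable from the working directory:

```
# Minimal sketch: expected shapes when running a 6-channel 256x256 batch through the
# baseline backbone built by vit_base_patch16_256 (defined in the README snippet / vit.py).
import torch
from vit import vit_base_patch16_256  # assumed import path

backbone = vit_base_patch16_256()
x = torch.randn(2, 6, 256, 256)            # batch of 6-channel 256x256 crops
tokens = backbone.forward_features(x)      # (2, 257, 768): class token + 16*16 patch tokens
embedding = backbone(x)                    # (2, 768): pooled embedding, since num_classes=0
```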
config.yaml ADDED
@@ -0,0 +1,15 @@
+ loss:
+   _target_: torch.nn.MSELoss  # combine with fourier loss weighted at 0.01 mixing factor for best results
+   reduction: none
+ optimizer:
+   _target_: timm.optim.lion.Lion
+   _partial_: true
+   lr: &lr 1e-4  # 1e-4 for <= ViT-B, and 3e-5 for ViT-L
+   weight_decay: 0.05
+   betas: [0.9, 0.95]
+ lr_scheduler:
+   _target_: torch.optim.lr_scheduler.OneCycleLR
+   _partial_: true
+   max_lr: *lr
+   pct_start: 0.1
+   anneal_strategy: cos
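
The `_target_`/`_partial_` keys follow the usual instantiate-from-config convention (e.g. Hydra-style). For readers not using such a framework, the following is a hedged plain-PyTorch sketch of the same wiring; the model and `total_steps` are placeholders, not values from the repo:

```
# Sketch only: optimizer and LR schedule matching the config above.
import torch
from timm.optim.lion import Lion  # module path taken from the config's _target_

model = torch.nn.Linear(768, 768)  # placeholder stand-in for the MAE parameters

criterion = torch.nn.MSELoss(reduction="none")
optimizer = Lion(model.parameters(), lr=1e-4, weight_decay=0.05, betas=(0.9, 0.95))
lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=1e-4,           # keep in sync with the optimizer lr (the &lr anchor above)
    total_steps=100_000,   # placeholder: set to your actual number of training steps
    pct_start=0.1,
    anneal_strategy="cos",
)
```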
loss.py ADDED
@@ -0,0 +1,50 @@
+ import torch
+ import torch.nn as nn
+
+
+ class FourierLoss(nn.Module):
+     def __init__(
+         self,
+         use_l1_loss: bool = True,
+         num_multimodal_modalities: int = 1,  # set to 1 for vanilla MAE, 6 for channel-agnostic MAE
+     ) -> None:
+         """
+         Fourier transform loss is only sound when using L1 or L2 loss to compare the frequency domains
+         between the images / their radial histograms.
+
+         We will always set `reduction="none"` and enforce that the computation of any reductions from the
+         output of this loss be managed by the model under question.
+         """
+         super().__init__()
+         self.loss = nn.L1Loss(reduction="none") if use_l1_loss else nn.MSELoss(reduction="none")
+         self.num_modalities = num_multimodal_modalities
+
+     def forward(self, input: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
+         # input = reconstructed image, target = original image
+         # flattened images from MAE are (B, H*W, C), so here we convert to B x C x H x W (note we assume H == W)
+         flattened_images = len(input.shape) == len(target.shape) == 3
+         if flattened_images:
+             B, H_W, C = input.shape
+             H_W = H_W // self.num_modalities
+             four_d_shape = (B, C * self.num_modalities, int(H_W**0.5), int(H_W**0.5))
+             input = input.view(*four_d_shape)
+             target = target.view(*four_d_shape)
+         else:
+             B, C, h, w = input.shape
+             H_W = h * w
+
+         if not (len(input.shape) == len(target.shape) == 4):
+             raise ValueError(f"Invalid input shape: got {input.shape} and {target.shape}.")
+
+         fft_reconstructed = torch.fft.fft2(input)
+         fft_original = torch.fft.fft2(target)
+
+         magnitude_reconstructed = torch.abs(fft_reconstructed)
+         magnitude_original = torch.abs(fft_original)
+
+         loss_tensor: torch.Tensor = self.loss(magnitude_reconstructed, magnitude_original)
+
+         if flattened_images:  # then output loss should be reshaped back to the token layout
+             loss_tensor = loss_tensor.reshape(B, H_W * self.num_modalities, C)
+
+         return loss_tensor
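
The config.yaml comment above suggests mixing this Fourier loss into the MSE reconstruction loss at a 0.01 weight. A hedged sketch of that combination (not from the repo; shapes follow the flattened MAE convention, and the masking/reduction policy is left to the caller):

```
# Sketch only: per-element MSE plus 0.01 * FourierLoss on flattened reconstructions.
import torch
from loss import FourierLoss  # assumes loss.py is importable as `loss`

B, num_tokens, token_dim = 2, 256, 16 * 16 * 6   # e.g. a 16x16 grid of 16x16 patches, 6 channels
reconstruction = torch.randn(B, num_tokens, token_dim)
target = torch.randn(B, num_tokens, token_dim)

pixel_loss = torch.nn.MSELoss(reduction="none")(reconstruction, target)
fourier_loss = FourierLoss(use_l1_loss=False, num_multimodal_modalities=1)(reconstruction, target)

total_loss = (pixel_loss + 0.01 * fourier_loss).mean()  # reduce however your trainer expects
```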
mae_modules.py ADDED
@@ -0,0 +1,272 @@
+ from functools import partial
+ from typing import Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ from timm.models.helpers import checkpoint_seq
+ from timm.models.vision_transformer import Block, Mlp, VisionTransformer
+
+ from .masking import transformer_random_masking
+ from .vit import channel_agnostic_vit
+
+ # If interested in training new MAEs, combine an encoder and decoder into a new module, and you should
+ # leverage the flattening and unflattening utilities as needed from mae_utils.py.
+ # Be sure to use an encoder-decoder Linear projection layer to match encoder dims with decoder dimensions.
+ # As described in the paper, images are self-standardized at the start.
+
+
+ class SelfStandardize(nn.Module):
+     def __init__(self) -> None:
+         super().__init__()
+         self.self_standardize = nn.LazyInstanceNorm2d(
+             affine=False, track_running_stats=False
+         )
+
+     def forward(self, pixels: torch.Tensor) -> torch.Tensor:
+         x = pixels.float() / 255.0
+         return self.self_standardize(x)
+
+
+ class MAEEncoder(nn.Module):
+     def __init__(
+         self,
+         vit_backbone: VisionTransformer,
+         max_in_chans: int = 6,
+         channel_agnostic: bool = False,
+     ) -> None:
+         super().__init__()
+         if channel_agnostic:
+             self.vit_backbone = channel_agnostic_vit(
+                 vit_backbone, max_in_chans=max_in_chans
+             )
+         else:
+             self.vit_backbone = vit_backbone
+         self.max_in_chans = max_in_chans
+         self.channel_agnostic = channel_agnostic
+
+     @property
+     def embed_dim(self) -> int:
+         return int(self.vit_backbone.embed_dim)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.vit_backbone.forward_features(x)
+         x = self.vit_backbone.forward_head(x)
+         return x  # type: ignore[no-any-return]
+
+     def forward_masked(
+         self,
+         x: torch.Tensor,
+         mask_ratio: float,
+         constant_noise: Union[torch.Tensor, None] = None,
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         x = self.vit_backbone.patch_embed(x)
+         x = self.vit_backbone._pos_embed(x)  # adds class token
+         x_ = x[:, 1:, :]  # no class token
+         x_, mask, ind_restore = transformer_random_masking(
+             x_, mask_ratio, constant_noise
+         )
+         x = torch.cat([x[:, :1, :], x_], dim=1)  # add class token
+         x = self.vit_backbone.norm_pre(x)
+
+         if self.vit_backbone.grad_checkpointing and not torch.jit.is_scripting():
+             x = checkpoint_seq(self.vit_backbone.blocks, x)
+         else:
+             x = self.vit_backbone.blocks(x)
+         x = self.vit_backbone.norm(x)
+         return x, mask, ind_restore
+
+
+ class MAEDecoder(nn.Module):
+     def __init__(
+         self,
+         embed_dim: int = 512,
+         depth: int = 8,
+         num_heads: int = 16,
+         mlp_ratio: float = 4,
+         qkv_bias: bool = True,
+         norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6),  # type: ignore[assignment]
+     ) -> None:
+         super().__init__()
+         self.embed_dim = embed_dim
+         self.pos_embeddings = None  # to be overwritten by MAE class
+         self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+         self.blocks = nn.Sequential(
+             *[
+                 Block(
+                     embed_dim,
+                     num_heads,
+                     mlp_ratio,
+                     qkv_bias=qkv_bias,
+                     norm_layer=norm_layer,
+                 )
+                 for i in range(depth)
+             ]
+         )
+         self.norm = norm_layer(embed_dim)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = x + self.pos_embeddings
+         x = self.blocks(x)
+         x = self.norm(x)
+         return x  # type: ignore[no-any-return]
+
+     def forward_masked(
+         self, x: torch.Tensor, ind_restore: torch.Tensor
+     ) -> torch.Tensor:
+         mask_tokens = self.mask_token.repeat(
+             x.shape[0], ind_restore.shape[1] + 1 - x.shape[1], 1
+         )
+         x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1)  # remove class token
+         x_ = torch.gather(
+             x_, dim=1, index=ind_restore.unsqueeze(-1).repeat(1, 1, x.shape[2])
+         )  # unshuffle
+         x = torch.cat([x[:, :1, :], x_], dim=1)  # add class token
+
+         x = x + self.pos_embeddings
+         x = self.blocks(x)
+         x = self.norm(x)
+         return x  # type: ignore[no-any-return]
+
+
+ class CrossAttention(nn.Module):
+     def __init__(
+         self, embed_dim, num_heads=8, qkv_bias=False, attn_drop=0.0, proj_drop=0.0
+     ):
+         super().__init__()
+         self.num_heads = num_heads
+         head_dim = embed_dim // num_heads
+         self.scale = head_dim**-0.5
+
+         self.q = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+         self.kv = nn.Linear(embed_dim, embed_dim * 2, bias=qkv_bias)
+
+         self.attn_drop = nn.Dropout(attn_drop)
+         self.proj = nn.Linear(embed_dim, embed_dim)
+         self.proj_drop = nn.Dropout(proj_drop)
+
+     def forward(self, x, context):
+         B, N, C = x.shape
+         _, M, _ = context.shape
+
+         q = (
+             self.q(x)
+             .reshape(B, N, self.num_heads, C // self.num_heads)
+             .permute(0, 2, 1, 3)
+         )
+         kv = (
+             self.kv(context)
+             .reshape(B, M, 2, self.num_heads, C // self.num_heads)
+             .permute(2, 0, 3, 1, 4)
+         )
+         k, v = kv[0], kv[1]
+
+         attn = (q @ k.transpose(-2, -1)) * self.scale
+         attn = attn.softmax(dim=-1)
+         attn = self.attn_drop(attn)
+
+         x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
+
+
+ class CAMAEDecoder(nn.Module):
+     def __init__(
+         self,
+         num_modalities: int = 6,
+         tokens_per_modality: int = 256,
+         embed_dim: int = 256,
+         depth: int = 2,
+         num_heads: int = 16,
+         mlp_ratio: float = 4,
+         qkv_bias: bool = True,
+         norm_layer: nn.Module = partial(nn.LayerNorm, eps=1e-6),  # type: ignore[assignment]
+     ) -> None:
+         super().__init__()
+         self.num_modalities = num_modalities
+         self.tokens_per_modality = tokens_per_modality
+         self.embed_dim = embed_dim
+         self.pos_embeddings = None  # to be overwritten by MAE class
+         self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+         self.placeholder = nn.Parameter(
+             torch.zeros(1, 1, embed_dim), requires_grad=False
+         )
+         self.modality_tokens = nn.ParameterList(
+             [
+                 nn.Parameter(torch.zeros(1, 1, self.embed_dim))
+                 for modality in range(self.num_modalities)
+             ]
+         )
+
+         self.cross_attention = CrossAttention(embed_dim=self.embed_dim)
+         self.mlp = Mlp(self.embed_dim, hidden_features=int(self.embed_dim * mlp_ratio))
+
+         self.decoders = nn.ModuleList(
+             [
+                 nn.Sequential(
+                     *[
+                         Block(
+                             embed_dim,
+                             num_heads,
+                             mlp_ratio,
+                             qkv_bias=qkv_bias,
+                             norm_layer=norm_layer,
+                         )
+                         for i in range(depth)
+                     ]
+                 )
+                 for modality in range(self.num_modalities)
+             ]
+         )
+         # self.norm = norm_layer(embed_dim)  # we decided to drop the last layer norm
+         self.context_norm = norm_layer(embed_dim)
+         self.query_norm = norm_layer(embed_dim)
+         self.out_norm = norm_layer(embed_dim)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x_m_s = []
+
+         modality_tokens_concat = torch.cat(
+             [
+                 self.placeholder,
+             ]  # placeholder for class token
+             + [
+                 m_t.repeat(1, self.tokens_per_modality, 1)
+                 for m_t in self.modality_tokens
+             ],
+             dim=1,
+         )
+
+         x = (
+             x + self.pos_embeddings + modality_tokens_concat
+         )  # add pos and tiled modality tokens
+         x_ = x[:, 1:, :]  # no class token
+         for m, decoder in enumerate(
+             self.decoders
+         ):  # iterate through modalities and decoders
+             x_m = x_[
+                 :, m * self.tokens_per_modality : (m + 1) * self.tokens_per_modality, :
+             ]
+             x_m = self.cross_attention(self.query_norm(x_m), self.context_norm(x_))
+             x_m = x_m + self.mlp(self.out_norm(x_m))
+             x_m = decoder(x_m)
+             x_m_s.append(x_m)
+         x_m_s = torch.cat(x_m_s, dim=1)  # concat all tokens
+         # x_m_s = self.norm(x_m_s)  # we decided to drop the last layer norm
+         x_m_s = torch.cat([x[:, :1, :], x_m_s], dim=1)  # add back class token
+
+         return x_m_s
+
+     def forward_masked(
+         self, x: torch.Tensor, ind_restore: torch.Tensor
+     ) -> torch.Tensor:
+         mask_tokens = self.mask_token.repeat(
+             x.shape[0], ind_restore.shape[1] + 1 - x.shape[1], 1
+         )
+         x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1)  # remove class token
+         x_ = torch.gather(
+             x_, dim=1, index=ind_restore.unsqueeze(-1).repeat(1, 1, x.shape[2])
+         )  # unshuffle
+         x = torch.cat([x[:, :1, :], x_], dim=1)  # add class token
+         x = self.forward(x)
+         return x
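
The header comment in mae_modules.py asks users to stitch an encoder and decoder together themselves, with a Linear projection between them and the flatten/unflatten helpers from mae_utils.py. Below is a minimal, hedged sketch of one way to do that; the decoder size, the pixel prediction head, the sincos decoder positional embeddings, the mask ratio, and the masked-token loss reduction are all assumptions for illustration, not the paper's exact training setup, and the flat import paths are assumed:

```
# Sketch only: a toy MAE assembled from the modules in this commit.
import torch
import torch.nn as nn

from mae_modules import MAEDecoder, MAEEncoder, SelfStandardize  # assumed import paths
from mae_utils import flatten_images
from vit import generate_2d_sincos_pos_embeddings, sincos_positional_encoding_vit, vit_small_patch16_256


class SimpleMAE(nn.Module):
    def __init__(self, mask_ratio: float = 0.75, patch_size: int = 16, in_chans: int = 6) -> None:
        super().__init__()
        self.mask_ratio = mask_ratio
        self.patch_size = patch_size
        self.self_standardize = SelfStandardize()  # images are self-standardized at the start

        backbone = sincos_positional_encoding_vit(vit_small_patch16_256())
        self.encoder = MAEEncoder(backbone, channel_agnostic=False)
        self.decoder = MAEDecoder(embed_dim=512, depth=8, num_heads=16)

        # linear projection to match encoder dims with decoder dims (per the header comment)
        self.encoder_decoder_proj = nn.Linear(self.encoder.embed_dim, self.decoder.embed_dim)
        # MAEDecoder leaves pos_embeddings as None; give it fixed sincos embeddings (assumption)
        self.decoder.pos_embeddings = generate_2d_sincos_pos_embeddings(
            embedding_dim=self.decoder.embed_dim,
            length=backbone.patch_embed.grid_size[0],
            use_class_token=True,
        )
        # predict raw patch pixels from decoder tokens (assumption; not specified in mae_modules.py)
        self.pred_head = nn.Linear(self.decoder.embed_dim, patch_size**2 * in_chans)

    def forward(self, img: torch.Tensor) -> torch.Tensor:
        x = self.self_standardize(img)                                      # (B, C, H, W)
        latent, mask, ind_restore = self.encoder.forward_masked(x, self.mask_ratio)
        decoded = self.decoder.forward_masked(self.encoder_decoder_proj(latent), ind_restore)
        pred = self.pred_head(decoded[:, 1:, :])                            # drop class token -> (B, L, p*p*C)
        target = flatten_images(x, patch_size=self.patch_size)              # (B, L, p*p*C)
        per_token = ((pred - target) ** 2).mean(dim=-1)                     # (B, L)
        return (per_token * mask).sum() / mask.sum()                        # loss on masked tokens only


mae = SimpleMAE()
loss = mae(torch.rand(2, 6, 256, 256) * 255)
loss.backward()
```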
mae_utils.py ADDED
@@ -0,0 +1,64 @@
+ import math
+
+ import torch
+
+
+ def flatten_images(img: torch.Tensor, patch_size: int, channel_agnostic: bool = False) -> torch.Tensor:
+     """
+     Flattens 2D images into tokens with the same pixel values
+
+     Parameters
+     ----------
+     img : input image tensor (N, C, H, W)
+
+     Returns
+     -------
+     flattened_img : flattened image tensor (N, L, patch_size**2 * C)
+     """
+
+     if (img.shape[2] != img.shape[3]) or (img.shape[2] % patch_size != 0):
+         raise ValueError("image H must equal image W and be divisible by patch_size")
+     in_chans = img.shape[1]
+
+     h = w = int(img.shape[2] // patch_size)
+     x = img.reshape(shape=(img.shape[0], in_chans, h, patch_size, w, patch_size))
+
+     if channel_agnostic:
+         x = torch.permute(x, (0, 1, 2, 4, 3, 5))  # NCHPWQ -> NCHWPQ
+         x = x.reshape(shape=(img.shape[0], in_chans * h * w, int(patch_size**2)))
+     else:
+         x = torch.permute(x, (0, 2, 4, 3, 5, 1))  # NCHPWQ -> NHWPQC
+         x = x.reshape(shape=(img.shape[0], h * w, int(patch_size**2 * in_chans)))
+     return x
+
+
+ def unflatten_tokens(
+     tokens: torch.Tensor, patch_size: int, num_modalities: int = 1, channel_agnostic: bool = False
+ ) -> torch.Tensor:
+     """
+     Unflattens tokens (N, L, patch_size**2 * C) into image tensor (N, C, H, W) with the same pixel values
+
+     Parameters
+     ----------
+     tokens : input token tensor (N, L, patch_size**2 * C)
+
+     Returns
+     -------
+     img : image tensor (N, C, H, W)
+     """
+     if num_modalities > 1 and not channel_agnostic:
+         raise ValueError("Multiple modalities requires channel agnostic unflattening.")
+
+     h = w = int(math.sqrt(tokens.shape[1] // num_modalities))
+     if h * w != (tokens.shape[1] // num_modalities):
+         raise ValueError("sqrt of number of tokens not integer")
+
+     if channel_agnostic:
+         x = tokens.reshape(shape=(tokens.shape[0], -1, h, w, patch_size, patch_size))
+         x = torch.permute(x, (0, 1, 2, 4, 3, 5))  # NCHWPQ -> NCHPWQ
+     else:
+         x = tokens.reshape(shape=(tokens.shape[0], h, w, patch_size, patch_size, -1))
+         x = torch.permute(x, (0, 5, 1, 3, 2, 4))  # NHWPQC -> NCHPWQ
+     img = x.reshape(shape=(x.shape[0], -1, h * patch_size, h * patch_size))
+
+     return img
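
A hedged round-trip sketch (not from the repo) illustrating the token shapes these helpers produce, for both the vanilla and channel-agnostic layouts, on a 6-channel 256x256 image with 16x16 patches:

```
# Sketch only: flatten/unflatten round trip and the resulting token shapes.
import torch
from mae_utils import flatten_images, unflatten_tokens  # assumes mae_utils.py is importable

img = torch.randn(2, 6, 256, 256)

tokens = flatten_images(img, patch_size=16)                             # (2, 256, 1536) = (N, h*w, p*p*C)
assert torch.equal(unflatten_tokens(tokens, patch_size=16), img)        # exact inverse

ca_tokens = flatten_images(img, patch_size=16, channel_agnostic=True)   # (2, 1536, 256) = (N, C*h*w, p*p)
restored = unflatten_tokens(ca_tokens, patch_size=16, num_modalities=6, channel_agnostic=True)
assert torch.equal(restored, img)
```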
masking.py ADDED
@@ -0,0 +1,46 @@
+ from typing import Tuple, Union
+
+ import torch
+
+
+ def transformer_random_masking(
+     x: torch.Tensor, mask_ratio: float, constant_noise: Union[torch.Tensor, None] = None
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Randomly mask patches per sample
+
+     Parameters
+     ----------
+     x : token tensor (N, L, D)
+     mask_ratio : float - ratio of the image to mask
+     constant_noise : optional tensor of shape (N, L); if provided, it is used to produce consistent masks
+
+     Returns
+     -------
+     x_masked : sub-sampled version of x, shape (N, int(L * (1 - mask_ratio)), D)
+     mask : binary mask indicating masked tokens (1 where masked), shape (N, L)
+     ind_restore : locations of masked tokens, needed for decoder
+     """
+
+     N, L, D = x.shape  # batch, length, dim
+     len_keep = int(L * (1 - mask_ratio))
+
+     # use random noise to generate batch-based random masks
+     if constant_noise is not None:
+         noise = constant_noise
+     else:
+         noise = torch.rand(N, L, device=x.device)
+
+     shuffled_tokens = torch.argsort(noise, dim=1)  # shuffled index
+     ind_restore = torch.argsort(shuffled_tokens, dim=1)  # unshuffled index
+
+     # get masked input
+     tokens_to_keep = shuffled_tokens[:, :len_keep]  # keep the first len_keep indices
+     x_masked = torch.gather(x, dim=1, index=tokens_to_keep.unsqueeze(-1).repeat(1, 1, D))
+
+     # get binary mask used for loss masking: 0 is keep, 1 is remove
+     mask = torch.ones([N, L], device=x.device)
+     mask[:, :len_keep] = 0
+     mask = torch.gather(mask, dim=1, index=ind_restore)  # unshuffle to get the binary mask
+
+     return x_masked, mask, ind_restore
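
A hedged usage sketch (not from the repo) showing what the masking function returns for a small token batch at the common 0.75 mask ratio:

```
# Sketch only: shapes returned by transformer_random_masking.
import torch
from masking import transformer_random_masking  # assumes masking.py is importable

tokens = torch.randn(2, 256, 384)                # (N, L, D) patch tokens, no class token
x_masked, mask, ind_restore = transformer_random_masking(tokens, mask_ratio=0.75)

print(x_masked.shape)     # torch.Size([2, 64, 384])  -- the 25% of tokens that were kept
print(mask.shape)         # torch.Size([2, 256])      -- 1 where a token was masked, 0 where kept
print(ind_restore.shape)  # torch.Size([2, 256])      -- used by the decoder to unshuffle
assert int(mask.sum(dim=1)[0]) == 256 - 64
```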
vit.py ADDED
@@ -0,0 +1,284 @@
+ import timm.models.vision_transformer as vit
+ import torch
+
+
+ def generate_2d_sincos_pos_embeddings(
+     embedding_dim: int, length: int, scale: float = 10000.0, use_class_token: bool = True, num_modality: int = 1
+ ) -> torch.nn.Parameter:
+     """
+     Generate 2-dimensional sin/cos positional embeddings
+
+     Parameters
+     ----------
+     embedding_dim : int
+         embedding dimension used in vit
+     length : int
+         number of tokens along height or width of image after patching (assuming square)
+     scale : float
+         scale for sin/cos functions
+     use_class_token : bool
+         True - add zero vector to be added to class_token, False - no vector added
+     num_modality : number of modalities. If 1, a single modality is assumed.
+         Otherwise one-hot modality encoding is added and sincos encoding size is appropriately reduced.
+
+     Returns
+     -------
+     positional_encoding : torch.Tensor
+         positional encoding to add to vit patch encodings
+         [num_modality*length*length, embedding_dim] or [1+num_modality*length*length, embedding_dim]
+         (w/ or w/o cls_token)
+     """
+
+     linear_positions = torch.arange(length, dtype=torch.float32)
+     height_mesh, width_mesh = torch.meshgrid(linear_positions, linear_positions, indexing="ij")
+     positional_dim = embedding_dim // 4  # accommodate h and w x cos and sin embeddings
+     positional_weights = torch.arange(positional_dim, dtype=torch.float32) / positional_dim
+     positional_weights = 1.0 / (scale**positional_weights)
+
+     height_weights = torch.outer(height_mesh.flatten(), positional_weights)
+     width_weights = torch.outer(width_mesh.flatten(), positional_weights)
+
+     positional_encoding = torch.cat(
+         [torch.sin(height_weights), torch.cos(height_weights), torch.sin(width_weights), torch.cos(width_weights)],
+         dim=1,
+     )[None, :, :]
+
+     # repeat positional encoding for multiple channel modalities
+     positional_encoding = positional_encoding.repeat(1, num_modality, 1)
+
+     if use_class_token:
+         class_token = torch.zeros([1, 1, embedding_dim], dtype=torch.float32)
+         positional_encoding = torch.cat([class_token, positional_encoding], dim=1)
+
+     positional_encoding = torch.nn.Parameter(positional_encoding, requires_grad=False)
+
+     return positional_encoding
+
+
+ class ChannelAgnosticPatchEmbed(vit.PatchEmbed):  # type: ignore[misc]
+     def __init__(
+         self,
+         img_size: int,
+         patch_size: int,
+         embed_dim: int,
+         bias: bool = True,
+     ) -> None:
+         super().__init__(
+             img_size=img_size,
+             patch_size=patch_size,
+             in_chans=1,  # in_chans is used by self.proj, which we override anyway
+             embed_dim=embed_dim,
+             norm_layer=None,
+             flatten=False,
+             bias=bias,
+         )
+         # channel-agnostic MAE has a single projection for all chans
+         self.proj = torch.nn.Conv2d(1, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         in_chans = x.shape[1]
+         x = torch.stack([self.proj(x[:, i : i + 1]) for i in range(in_chans)], dim=2)  # single projection for all chans
+         x = x.flatten(2).transpose(1, 2)  # BCMHW -> BNC
+         return x
+
+
+ class ChannelAgnosticViT(vit.VisionTransformer):  # type: ignore[misc]
+     def _pos_embed(self, x: torch.Tensor) -> torch.Tensor:
+         # rewrite of https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L586
+         to_cat = []
+         if self.cls_token is not None:
+             to_cat.append(self.cls_token.expand(x.shape[0], -1, -1))
+
+         # TODO: upgrade timm to get access to register tokens
+         # if self.vit_backbone.reg_token is not None:
+         #     to_cat.append(self.reg_token.expand(x.shape[0], -1, -1))
+
+         # MAIN DIFFERENCE from timm: we DYNAMICALLY ADD POS EMBEDDINGS based on the shape of the inputs.
+         # This supports having CA-MAEs actually be channel-agnostic at inference time.
+         if self.no_embed_class:
+             x = x + self.pos_embed[:, : x.shape[1]]
+             if to_cat:
+                 x = torch.cat(to_cat + [x], dim=1)
+         else:
+             if to_cat:
+                 x = torch.cat(to_cat + [x], dim=1)
+             x = x + self.pos_embed[:, : x.shape[1]]
+         return self.pos_drop(x)  # type: ignore[no-any-return]
+
+
+ def channel_agnostic_vit(vit_backbone: vit.VisionTransformer, max_in_chans: int) -> vit.VisionTransformer:
+     # replace patch embedding with channel-agnostic version
+     vit_backbone.patch_embed = ChannelAgnosticPatchEmbed(
+         img_size=vit_backbone.patch_embed.img_size[0],
+         patch_size=vit_backbone.patch_embed.patch_size[0],
+         embed_dim=vit_backbone.embed_dim,
+     )
+
+     # replace positional embedding with channel-agnostic version
+     vit_backbone.pos_embed = generate_2d_sincos_pos_embeddings(
+         embedding_dim=vit_backbone.embed_dim,
+         length=vit_backbone.patch_embed.grid_size[0],
+         use_class_token=vit_backbone.cls_token is not None,
+         num_modality=max_in_chans,
+     )
+
+     # change the class to be ChannelAgnostic so that it actually uses the new _pos_embed
+     vit_backbone.__class__ = ChannelAgnosticViT
+     return vit_backbone
+
+
+ def sincos_positional_encoding_vit(
+     vit_backbone: vit.VisionTransformer, scale: float = 10000.0
+ ) -> vit.VisionTransformer:
+     """Attaches no-grad sin-cos positional embeddings to a pre-constructed ViT backbone model.
+
+     Parameters
+     ----------
+     vit_backbone : timm.models.vision_transformer.VisionTransformer
+         the constructed vision transformer from timm
+     scale : float (default 10000.0)
+         hyperparameter for sincos positional embeddings, recommend keeping at 10,000
+
+     Returns
+     -------
+     timm.models.vision_transformer.VisionTransformer
+         the same ViT but with fixed no-grad positional encodings to add to vit patch encodings
+     """
+     # length: number of tokens along height or width of image after patching (assuming square)
+     length = vit_backbone.patch_embed.img_size[0] // vit_backbone.patch_embed.patch_size[0]
+     pos_embeddings = generate_2d_sincos_pos_embeddings(
+         vit_backbone.embed_dim, length=length, scale=scale, use_class_token=vit_backbone.cls_token is not None
+     )
+     # note, if the model had weight_init == 'skip', this might get overwritten
+     vit_backbone.pos_embed = pos_embeddings
+     return vit_backbone
+
+
+ def vit_small_patch16_256(**kwargs):
+     default_kwargs = dict(
+         img_size=256,
+         in_chans=6,
+         num_classes=0,
+         fc_norm=None,
+         class_token=True,
+         drop_path_rate=0.1,
+         init_values=0.0001,
+         block_fn=vit.ParallelScalingBlock,
+         qkv_bias=False,
+         qk_norm=True,
+     )
+     for k, v in kwargs.items():
+         default_kwargs[k] = v
+     return vit.vit_small_patch16_224(**default_kwargs)
+
+
+ def vit_small_patch32_512(**kwargs):
+     default_kwargs = dict(
+         img_size=512,
+         in_chans=6,
+         num_classes=0,
+         fc_norm=None,
+         class_token=True,
+         drop_path_rate=0.1,
+         init_values=0.0001,
+         block_fn=vit.ParallelScalingBlock,
+         qkv_bias=False,
+         qk_norm=True,
+     )
+     for k, v in kwargs.items():
+         default_kwargs[k] = v
+     return vit.vit_small_patch32_384(**default_kwargs)
+
+
+ def vit_base_patch8_256(**kwargs):
+     default_kwargs = dict(
+         img_size=256,
+         in_chans=6,
+         num_classes=0,
+         fc_norm=None,
+         class_token=True,
+         drop_path_rate=0.1,
+         init_values=0.0001,
+         block_fn=vit.ParallelScalingBlock,
+         qkv_bias=False,
+         qk_norm=True,
+     )
+     for k, v in kwargs.items():
+         default_kwargs[k] = v
+     return vit.vit_base_patch8_224(**default_kwargs)
+
+
+ def vit_base_patch16_256(**kwargs):
+     default_kwargs = dict(
+         img_size=256,
+         in_chans=6,
+         num_classes=0,
+         fc_norm=None,
+         class_token=True,
+         drop_path_rate=0.1,
+         init_values=0.0001,
+         block_fn=vit.ParallelScalingBlock,
+         qkv_bias=False,
+         qk_norm=True,
+     )
+     for k, v in kwargs.items():
+         default_kwargs[k] = v
+     return vit.vit_base_patch16_224(**default_kwargs)
+
+
+ def vit_base_patch32_512(**kwargs):
+     default_kwargs = dict(
+         img_size=512,
+         in_chans=6,
+         num_classes=0,
+         fc_norm=None,
+         class_token=True,
+         drop_path_rate=0.1,
+         init_values=0.0001,
+         block_fn=vit.ParallelScalingBlock,
+         qkv_bias=False,
+         qk_norm=True,
+     )
+     for k, v in kwargs.items():
+         default_kwargs[k] = v
+     return vit.vit_base_patch32_384(**default_kwargs)
+
+
+ def vit_large_patch8_256(**kwargs):
+     default_kwargs = dict(
+         img_size=256,
+         in_chans=6,
+         num_classes=0,
+         fc_norm=None,
+         class_token=True,
+         patch_size=8,
+         embed_dim=1024,
+         depth=24,
+         num_heads=16,
+         drop_path_rate=0.3,
+         init_values=0.0001,
+         block_fn=vit.ParallelScalingBlock,
+         qkv_bias=False,
+         qk_norm=True,
+     )
+     for k, v in kwargs.items():
+         default_kwargs[k] = v
+     return vit.VisionTransformer(**default_kwargs)
+
+
+ def vit_large_patch16_256(**kwargs):
+     default_kwargs = dict(
+         img_size=256,
+         in_chans=6,
+         num_classes=0,
+         fc_norm=None,
+         class_token=True,
+         drop_path_rate=0.3,
+         init_values=0.0001,
+         block_fn=vit.ParallelScalingBlock,
+         qkv_bias=False,
+         qk_norm=True,
+     )
+     for k, v in kwargs.items():
+         default_kwargs[k] = v
+     return vit.vit_large_patch16_384(**default_kwargs)
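
A hedged sketch (not from the repo) of making a backbone channel-agnostic and running it on inputs with different channel counts, which is what the dynamic `_pos_embed` above enables; it assumes a recent timm release and that `vit.py` is importable:

```
# Sketch only: channel_agnostic_vit applied to a ViT-S/16 backbone at 256x256.
import torch
from vit import channel_agnostic_vit, vit_small_patch16_256  # assumed import path

backbone = channel_agnostic_vit(vit_small_patch16_256(), max_in_chans=6)

three_chan = torch.randn(2, 3, 256, 256)
six_chan = torch.randn(2, 6, 256, 256)

# each channel is patchified with the same shared projection, so the token count scales with channels
print(backbone.forward_features(three_chan).shape)  # torch.Size([2, 1 + 3*256, 384])
print(backbone.forward_features(six_chan).shape)    # torch.Size([2, 1 + 6*256, 384])
```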