cc
- scripts/build_cyclegan_dataset.py +6 -0
- swim/autoencoder.py +247 -0
- swim/blocks.py +53 -11
- swim/codeblock.py +1 -1
- swim/encoder.py +0 -90
- train.py +67 -3
scripts/build_cyclegan_dataset.py
CHANGED
@@ -71,6 +71,9 @@ def build_cyclegan_dataset(swim_dir: str, output_dir: str, type: str, no_night:
                 )
     else:
         for label in tqdm(train_labels, desc="train"):
+            if label["weather"] != "clear":
+                continue
+
             if label["timeofday"] == "night":
                 os.system(
                     f"cp {os.path.join(swim_dir, 'train', 'images', label['name'])} {os.path.join(output_dir, 'trainB', label['name'])}"
@@ -81,6 +84,9 @@ def build_cyclegan_dataset(swim_dir: str, output_dir: str, type: str, no_night:
                 )

         for label in tqdm(val_labels, desc="val"):
+            if label["weather"] != "clear":
+                continue
+
             if label["timeofday"] == "night":
                 os.system(
                     f"cp {os.path.join(swim_dir, 'val', 'images', label['name'])} {os.path.join(output_dir, 'testB', label['name'])}"
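Both hunks add the same guard: any label whose "weather" field is not "clear" is now skipped before the day/night copy logic runs, so only clear-weather images land in the CycleGAN splits. A minimal sketch of that selection rule in isolation — the label keys "weather", "timeofday" and "name" follow the usage above, but the helper name and the shutil-based copy (instead of os.system("cp ...")) are illustrative, not part of the diff:

import os
import shutil
from typing import List


def copy_clear_night_images(labels: List[dict], src_dir: str, dst_dir: str) -> None:
    """Hypothetical helper mirroring the new filter: clear-weather, night-time images only."""
    for label in labels:
        # New guard from this commit: ignore anything that is not clear weather.
        if label["weather"] != "clear":
            continue
        if label["timeofday"] == "night":
            shutil.copy(
                os.path.join(src_dir, "images", label["name"]),
                os.path.join(dst_dir, label["name"]),
            )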
swim/autoencoder.py
CHANGED
@@ -0,0 +1,247 @@
from typing import List

import torch
import torch.nn.functional as F
from torch import nn

from .blocks import (
    ResnetBlock,
    AttentionBlock,
    GroupNorm,
    UpSampleBlock,
    DownSampleBlock,
)


class Autoencoder(nn.Module):

    def __init__(
        self,
        channels: int,
        channel_multipliers: List[int],
        n_resnet_blocks: int,
        in_channels: int,
        z_channels: int,
        emb_channels: int,
    ):
        super().__init__()
        self.encoder = Encoder(
            channels=channels,
            channel_multipliers=channel_multipliers,
            n_resnet_blocks=n_resnet_blocks,
            in_channels=in_channels,
            z_channels=z_channels,
        )
        self.decoder = Decoder(
            channels=channels,
            channel_multipliers=channel_multipliers,
            n_resnet_blocks=n_resnet_blocks,
            out_channels=in_channels,
            z_channels=z_channels,
        )
        # Convolution to map from embedding space to
        # quantized embedding space moments (mean and log variance)
        self.quant_conv = nn.Conv2d(2 * z_channels, 2 * emb_channels, 1)
        # Convolution to map from quantized embedding space back to
        # embedding space
        self.post_quant_conv = nn.Conv2d(emb_channels, z_channels, 1)

    def encode(self, img: torch.Tensor) -> "GaussianDistribution":
        # Get embeddings with shape `[batch_size, z_channels * 2, z_height, z_height]`
        z = self.encoder(img)
        # Get the moments in the quantized embedding space
        moments = self.quant_conv(z)
        # Return the distribution
        return GaussianDistribution(moments)

    def decode(self, z: torch.Tensor):
        # Map to embedding space from the quantized representation
        z = self.post_quant_conv(z)
        # Decode the image of shape `[batch_size, channels, height, width]`
        return self.decoder(z)

    def forward(self, x: torch.Tensor, sample_posterior: bool = False):
        posterior = self.encode(x)
        if sample_posterior:
            z = posterior.sample()
        else:
            z = posterior.mode()
        decoded_x = self.decode(z)
        return decoded_x, posterior


class Encoder(nn.Module):
    def __init__(
        self,
        *,
        channels: int,
        channel_multipliers: List[int],
        n_resnet_blocks: int,
        in_channels: int,
        z_channels: int
    ):
        super().__init__()

        # Number of blocks of different resolutions.
        # The resolution is halved at the end of each top-level block
        n_resolutions = len(channel_multipliers)

        # Initial $3 \times 3$ convolution layer that maps the image to `channels`
        self.conv_in = nn.Conv2d(in_channels, channels, 3, stride=1, padding=1)

        # Number of channels in each top-level block
        channels_list = [m * channels for m in [1] + channel_multipliers]

        # List of top-level blocks
        self.down = nn.ModuleList()
        # Create top-level blocks
        for i in range(n_resolutions):
            # Each top-level block consists of multiple ResNet blocks and down-sampling
            resnet_blocks = nn.ModuleList()
            # Add ResNet blocks
            for _ in range(n_resnet_blocks):
                resnet_blocks.append(ResnetBlock(channels, channels_list[i + 1]))
                channels = channels_list[i + 1]
            # Top-level block
            down = nn.Module()
            down.block = resnet_blocks
            # Down-sampling at the end of each top-level block except the last
            if i != n_resolutions - 1:
                down.downsample = DownSampleBlock(channels)
            else:
                down.downsample = nn.Identity()
            self.down.append(down)

        # Final ResNet blocks with attention
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(channels, channels)
        self.mid.attn_1 = AttentionBlock(channels)
        self.mid.block_2 = ResnetBlock(channels, channels)

        # Map to embedding space with a $3 \times 3$ convolution
        self.norm_out = GroupNorm(channels)
        self.conv_out = nn.Conv2d(channels, 2 * z_channels, 3, stride=1, padding=1)

    def forward(self, img: torch.Tensor):
        # Map to `channels` with the initial convolution
        x = self.conv_in(img)

        # Top-level blocks
        for down in self.down:
            # ResNet blocks
            for block in down.block:
                x = block(x)
            # Down-sampling
            x = down.downsample(x)

        # Final ResNet blocks with attention
        x = self.mid.block_1(x)
        x = self.mid.attn_1(x)
        x = self.mid.block_2(x)

        # Normalize and map to embedding space
        x = self.norm_out(x)
        x = F.silu(x)
        x = self.conv_out(x)

        return x


class Decoder(nn.Module):

    def __init__(
        self,
        *,
        channels: int,
        channel_multipliers: List[int],
        n_resnet_blocks: int,
        out_channels: int,
        z_channels: int
    ):
        super().__init__()

        # Number of blocks of different resolutions.
        # The resolution is doubled at the end of each top-level block
        num_resolutions = len(channel_multipliers)

        # Number of channels in each top-level block, in the reverse order
        channels_list = [m * channels for m in channel_multipliers]

        # Number of channels in the top-level block
        channels = channels_list[-1]

        # Initial $3 \times 3$ convolution layer that maps the embedding space to `channels`
        self.conv_in = nn.Conv2d(z_channels, channels, 3, stride=1, padding=1)

        # ResNet blocks with attention
        self.mid = nn.Module()
        self.mid.block_1 = ResnetBlock(channels, channels)
        self.mid.attn_1 = AttentionBlock(channels)
        self.mid.block_2 = ResnetBlock(channels, channels)

        # List of top-level blocks
        self.up = nn.ModuleList()
        # Create top-level blocks
        for i in reversed(range(num_resolutions)):
            # Each top-level block consists of multiple ResNet blocks and up-sampling
            resnet_blocks = nn.ModuleList()
            # Add ResNet blocks
            for _ in range(n_resnet_blocks + 1):
                resnet_blocks.append(ResnetBlock(channels, channels_list[i]))
                channels = channels_list[i]
            # Top-level block
            up = nn.Module()
            up.block = resnet_blocks
            # Up-sampling at the end of each top-level block except the first
            if i != 0:
                up.upsample = UpSampleBlock(channels)
            else:
                up.upsample = nn.Identity()
            # Prepend to be consistent with the checkpoint
            self.up.insert(0, up)

        # Map to image space with a $3 \times 3$ convolution
        self.norm_out = GroupNorm(channels)
        self.conv_out = nn.Conv2d(channels, out_channels, 3, stride=1, padding=1)

    def forward(self, z: torch.Tensor):
        # Map to `channels` with the initial convolution
        h = self.conv_in(z)

        # ResNet blocks with attention
        h = self.mid.block_1(h)
        h = self.mid.attn_1(h)
        h = self.mid.block_2(h)

        # Top-level blocks
        for up in reversed(self.up):
            # ResNet blocks
            for block in up.block:
                h = block(h)
            # Up-sampling
            h = up.upsample(h)

        # Normalize and map to image space
        h = self.norm_out(h)
        h = F.silu(h)
        img = self.conv_out(h)

        return img


class GaussianDistribution:
    def __init__(self, parameters: torch.Tensor):
        # Split mean and log of variance
        self.mean, log_var = torch.chunk(parameters, 2, dim=1)
        # Clamp the log of variances
        self.log_var = torch.clamp(log_var, -30.0, 20.0)
        # Calculate standard deviation
        self.std = torch.exp(0.5 * self.log_var)

    def sample(self):
        # Sample from the distribution
        return self.mean + self.std * torch.randn_like(self.std)

    def mode(self):
        return self.mean
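The Autoencoder above is a KL-regularised VAE in the Stable Diffusion style: encode() returns a GaussianDistribution over the latent, decode() maps a latent back to an image, and forward() chains the two. A quick shape check, assuming the package is importable and reusing the hyperparameters from the commented-out block in train.py (the 64x64 input is just for a fast sanity check, not a training setting from this commit):

import torch
from swim.autoencoder import Autoencoder

vae = Autoencoder(
    channels=128,
    channel_multipliers=[1, 2, 4, 4],
    n_resnet_blocks=2,
    in_channels=3,
    z_channels=4,
    emb_channels=4,
)

x = torch.randn(1, 3, 64, 64)
recon, posterior = vae(x, sample_posterior=True)
print(posterior.mean.shape)  # expected [1, 4, 8, 8]: three down-sampling stages take 64 -> 8
print(recon.shape)           # expected [1, 3, 64, 64]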
swim/blocks.py
CHANGED
@@ -36,23 +36,24 @@ class GroupNorm(nn.Module):
         return self.group_norm(x)


-class
+class UpSampleBlock(nn.Module):
     def __init__(self, channels: int):
         super().__init__()
         self.conv = nn.Conv2d(channels, channels, 3, padding=1)

     def forward(self, x: torch.Tensor):
-        x = F.interpolate(x, scale_factor=2, mode="nearest")
+        x = F.interpolate(x, scale_factor=2.0, mode="nearest")
         return self.conv(x)


-class
+class DownSampleBlock(nn.Module):
     def __init__(self, channels: int):
         super().__init__()
-        self.
+        self.conv = nn.Conv2d(channels, channels, 3, stride=2, padding=0)

     def forward(self, x: torch.Tensor):
-
+        x = F.pad(x, (0, 1, 0, 1), mode="constant", value=0)
+        return self.conv(x)


 class TimestepBlock(nn.Module):
@@ -128,12 +129,6 @@ class ResnetBlock(nn.Module):


 class AttentionBlock(nn.Module):
-    """Attention mechanism similar to transformers but for CNNs, paper https://arxiv.org/abs/1805.08318
-
-    Args:
-        in_channels (int): Number of channels in the input tensor.
-    """
-
     def __init__(self, in_channels: int) -> None:
         super().__init__()

@@ -183,3 +178,50 @@ class AttentionBlock(nn.Module):

         # adding the identity to the output
         return x + attention
+
+
+class AttentionBlock(nn.Module):
+    def __init__(self, channels: int):
+        super().__init__()
+        # Group normalization
+        self.norm = GroupNorm(channels)
+        # Query, key and value mappings
+        self.q = nn.Conv2d(channels, channels, 1)
+        self.k = nn.Conv2d(channels, channels, 1)
+        self.v = nn.Conv2d(channels, channels, 1)
+
+        self.proj_out = nn.Conv2d(channels, channels, 1)
+
+        # Attention scaling factor
+        self.scale = channels**-0.5
+
+    def forward(self, x: torch.Tensor):
+        # Normalize `x`
+        x_norm = self.norm(x)
+        # Get query, key and value embeddings
+        q = self.q(x_norm)
+        k = self.k(x_norm)
+        v = self.v(x_norm)
+
+        # Reshape the query, key and value embeddings from
+        # `[batch_size, channels, height, width]` to
+        # `[batch_size, channels, height * width]`
+        b, c, h, w = q.shape
+        q = q.view(b, c, h * w)
+        k = k.view(b, c, h * w)
+        v = v.view(b, c, h * w)
+
+        # Compute $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_{key}}}\Bigg)$
+        attn = torch.einsum("bci,bcj->bij", q, k) * self.scale
+        attn = F.softmax(attn, dim=2)
+
+        # Compute $\underset{seq}{softmax}\Bigg(\frac{Q K^\top}{\sqrt{d_{key}}}\Bigg)V$
+        out = torch.einsum("bij,bcj->bci", attn, v)
+
+        # Reshape back to `[batch_size, channels, height, width]`
+        out = out.view(b, c, h, w)
+        # Final $1 \times 1$ convolution layer
+        out = self.proj_out(out)
+
+        # Add residual connection
+        return x + out
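The new UpSampleBlock/DownSampleBlock pair is exact-inverse in shape: DownSampleBlock pads one pixel on the right and bottom edges and then applies a 3x3, stride-2, padding-0 convolution, which exactly halves even spatial sizes, while UpSampleBlock doubles them with nearest-neighbour interpolation before a 3x3 convolution. A small shape check, assuming the blocks are importable as defined above:

import torch
from swim.blocks import DownSampleBlock, UpSampleBlock

x = torch.randn(1, 64, 32, 32)
down = DownSampleBlock(64)
up = UpSampleBlock(64)

y = down(x)      # pad (0, 1, 0, 1), then 3x3 conv with stride 2 -> half the resolution
print(y.shape)   # torch.Size([1, 64, 16, 16])

z = up(y)        # nearest-neighbour x2, then 3x3 conv with padding 1 -> back to 32x32
print(z.shape)   # torch.Size([1, 64, 32, 32])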
swim/codeblock.py
CHANGED
@@ -2,7 +2,7 @@ import torch
 import torch.nn as nn


-class
+class SwimCodeBook(nn.Module):
     def __init__(
         self, num_codebook_vectors: int = 1024, latent_dim: int = 256, beta: int = 0.25
     ):
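Only the class declaration and constructor signature of SwimCodeBook appear in this hunk; the quantization logic itself lies outside the diff. For orientation, a VQGAN-style codebook with these constructor arguments typically performs a nearest-neighbour lookup with a straight-through estimator and a beta-weighted commitment loss. The sketch below illustrates that standard technique only; it is not this repository's actual implementation:

import torch
import torch.nn as nn


class CodebookSketch(nn.Module):
    """Illustrative VQ codebook with the same constructor signature as SwimCodeBook."""

    def __init__(self, num_codebook_vectors: int = 1024, latent_dim: int = 256, beta: float = 0.25):
        super().__init__()
        self.beta = beta
        self.embedding = nn.Embedding(num_codebook_vectors, latent_dim)
        self.embedding.weight.data.uniform_(-1.0 / num_codebook_vectors, 1.0 / num_codebook_vectors)

    def forward(self, z: torch.Tensor):
        # z: [batch, latent_dim, height, width] -> flatten to [N, latent_dim]
        d = self.embedding.weight.shape[1]
        z_flat = z.permute(0, 2, 3, 1).reshape(-1, d)
        # Squared distances to every codebook vector
        dist = (
            z_flat.pow(2).sum(1, keepdim=True)
            - 2 * z_flat @ self.embedding.weight.t()
            + self.embedding.weight.pow(2).sum(1)
        )
        indices = dist.argmin(1)
        z_q = self.embedding(indices).view(z.shape[0], z.shape[2], z.shape[3], d).permute(0, 3, 1, 2)
        # Codebook loss plus beta-weighted commitment loss
        loss = ((z_q - z.detach()) ** 2).mean() + self.beta * ((z_q.detach() - z) ** 2).mean()
        # Straight-through estimator so gradients flow back to the encoder
        z_q = z + (z_q - z).detach()
        return z_q, indices, loss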
swim/encoder.py
DELETED
@@ -1,90 +0,0 @@
import torch
import torch.nn as nn

from .blocks import DownsampleBlock, GroupNorm, AttentionBlock, ResnetBlock


class SwimEncoder(nn.Module):
    """
    The encoder part of the VQGAN.

    Args:
        img_channels (int): Number of channels in the input image.
        image_size (int): Size of the input image, only used in the encoder (height or width).
        latent_channels (int): Number of channels in the latent vector.
        intermediate_channels (list): List of channels in the intermediate layers.
        num_residual_blocks (int): Number of residual blocks between each downsample block.
        dropout (float): Dropout probability for residual blocks.
        attention_resolution (list): Tensor size (height or width) at which to add attention blocks.
    """

    def __init__(
        self,
        img_channels: int = 3,
        image_size: int = 256,
        latent_channels: int = 256,
        intermediate_channels: list = [128, 128, 256, 256, 512],
        num_residual_blocks: int = 2,
        dropout: float = 0.0,
        attention_resolution: list = [16],
    ):
        super().__init__()

        # Inserting the first intermediate channel at index 0
        intermediate_channels.insert(0, intermediate_channels[0])

        # All layers are appended to this list
        layers = []

        # Adding the first conv layer to increase input channels to the first intermediate channels
        layers.append(
            nn.Conv2d(
                img_channels,
                intermediate_channels[0],
                kernel_size=3,
                stride=1,
                padding=1,
            )
        )

        # Loop over the intermediate channels except the last one
        for n in range(len(intermediate_channels) - 1):
            in_channels = intermediate_channels[n]
            out_channels = intermediate_channels[n + 1]

            # Adding the residual blocks for each channel
            for _ in range(num_residual_blocks):
                layers.append(ResnetBlock(in_channels, out_channels, dropout=dropout))
                in_channels = out_channels

            # Once we have downsampled the image to a size in attention_resolution, we add attention blocks
            if image_size in attention_resolution:
                layers.append(AttentionBlock(in_channels))

            # Only downsample for the first n-2 layers, and decrease the input size by a factor of 2
            if n != len(intermediate_channels) - 2:
                layers.append(DownsampleBlock(intermediate_channels[n + 1]))
                image_size = image_size // 2  # Downsample by a factor of 2

        in_channels = intermediate_channels[-1]
        layers.extend(
            [
                ResnetBlock(
                    in_channels=in_channels, out_channels=in_channels, dropout=dropout
                ),
                AttentionBlock(in_channels=in_channels),
                ResnetBlock(
                    in_channels=in_channels, out_channels=in_channels, dropout=dropout
                ),
                GroupNorm(in_channels=in_channels),
                nn.SiLU(),
                # increase the channels up to the latent vector channels
                nn.Conv2d(
                    in_channels, latent_channels, kernel_size=3, stride=1, padding=1
                ),
            ]
        )
        self.model = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.model(x)
train.py
CHANGED
@@ -1,8 +1,72 @@
 import torch
 from torchinfo import summary
-from swim.encoder import SwimEncoder

-
+from swim.autoencoder import Autoencoder
+from diffusers import AutoencoderKL, UNet2DModel
+
+# vae = Autoencoder(
+#     z_channels=4,
+#     in_channels=3,
+#     channels=128,
+#     channel_multipliers=[1, 2, 4, 4],
+#     n_resnet_blocks=2,
+#     emb_channels=4,
+# ).to("meta")
+# lol_vae = AutoencoderKL.from_pretrained(
+#     "stabilityai/stable-diffusion-2-1", subfolder="vae"
+# ).to("meta")
+
+# # copy weights from lol_vae to vae
+# import json
+
+# with open("lolvae.json", "w") as f:
+#     json.dump(list(lol_vae.state_dict().keys()), f, indent=4)
+
+# with open("vae.json", "w") as f:
+#     json.dump(list(vae.state_dict().keys()), f, indent=4)
+
+# sample = torch.randn(1, 3, 512, 512).to("meta")
+# # latent = vae.encoder(sample)
+
+from diffusers import UNet2DModel
+
+model = UNet2DModel(
+    sample_size=512,  # the target image resolution
+    in_channels=3,  # the number of input channels, 3 for RGB images
+    out_channels=3,  # the number of output channels
+    layers_per_block=2,  # how many ResNet layers to use per UNet block
+    block_out_channels=(
+        128,
+        128,
+        256,
+        256,
+        512,
+        512,
+    ),  # the number of output channels for each UNet block
+    down_block_types=(
+        "DownBlock2D",  # a regular ResNet downsampling block
+        "DownBlock2D",
+        "DownBlock2D",
+        "DownBlock2D",
+        "AttnDownBlock2D",  # a ResNet downsampling block with spatial self-attention
+        "DownBlock2D",
+    ),
+    up_block_types=(
+        "UpBlock2D",  # a regular ResNet upsampling block
+        "AttnUpBlock2D",  # a ResNet upsampling block with spatial self-attention
+        "UpBlock2D",
+        "UpBlock2D",
+        "UpBlock2D",
+        "UpBlock2D",
+    ),
+).to("meta")
+
 sample = torch.randn(1, 3, 512, 512).to("meta")

-summary(
+summary(
+    model,
+    input_data=(
+        sample,
+        0,
+    ),
+)
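The commented-out block earlier in train.py dumps the state_dict keys of the custom Autoencoder and of the pretrained diffusers AutoencoderKL to JSON so the two layouts can be lined up for a weight copy. A shorter way to inspect the overlap directly is sketched below, assuming both models build on the meta device exactly as in those comments (the prints and set arithmetic are illustrative, not part of the commit):

import torch
from diffusers import AutoencoderKL
from swim.autoencoder import Autoencoder

vae = Autoencoder(
    channels=128,
    channel_multipliers=[1, 2, 4, 4],
    n_resnet_blocks=2,
    in_channels=3,
    z_channels=4,
    emb_channels=4,
).to("meta")
lol_vae = AutoencoderKL.from_pretrained(
    "stabilityai/stable-diffusion-2-1", subfolder="vae"
).to("meta")

ours = set(vae.state_dict().keys())
theirs = set(lol_vae.state_dict().keys())
print(len(ours & theirs), "keys already match by name")
print(sorted(ours - theirs)[:10], "... only in the custom Autoencoder")
print(sorted(theirs - ours)[:10], "... only in the diffusers checkpoint")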