ai-tube-model-ltxv-1

Paused

App Files Files Community

Sapir committed 29 days ago

Commit

65dad79

•

1 Parent(s): f63ea56

VAE: Add timestep conditioning

Browse files

Files changed (2) hide show

xora/models/autoencoders/causal_video_autoencoder.py +164 -34
xora/models/autoencoders/vae.py +11 -4

xora/models/autoencoders/causal_video_autoencoder.py CHANGED Viewed

@@ -10,6 +10,8 @@ from einops import rearrange
 from torch import nn
 from diffusers.utils import logging
 import torch.nn.functional as F
 from xora.models.autoencoders.conv_nd_factory import make_conv_nd, make_linear_nd
 from xora.models.autoencoders.pixel_norm import PixelNorm
@@ -94,6 +96,7 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
             patch_size=config.get("patch_size", 1),
             norm_layer=config.get("norm_layer", "group_norm"),
             causal=config.get("causal_decoder", False),
         )
         dims = config["dims"]
@@ -122,6 +125,7 @@ class CausalVideoAutoencoder(AutoencoderKLWrapper):
             latent_log_var=self.encoder.latent_log_var,
             use_quant_conv=self.use_quant_conv,
             causal_decoder=self.decoder.causal,
         )
     @property
@@ -449,6 +453,7 @@ class Decoder(nn.Module):
         patch_size: int = 1,
         norm_layer: str = "group_norm",
         causal: bool = True,
     ):
         super().__init__()
         self.patch_size = patch_size
@@ -502,6 +507,7 @@ class Decoder(nn.Module):
                     norm_layer=norm_layer,
                     attention_head_dim=block_params["attention_head_dim"],
                     inject_noise=block_params.get("inject_noise", False),
                 )
             elif block_name == "res_x_y":
                 output_channel = output_channel // block_params.get("multiplier", 2)
@@ -513,6 +519,7 @@ class Decoder(nn.Module):
                     groups=norm_num_groups,
                     norm_layer=norm_layer,
                     inject_noise=block_params.get("inject_noise", False),
                 )
             elif block_name == "compress_time":
                 block = DepthToSpaceUpsample(
@@ -552,9 +559,28 @@ class Decoder(nn.Module):
         self.gradient_checkpointing = False
-    def forward(self, sample: torch.FloatTensor, target_shape) -> torch.FloatTensor:
         r"""The forward method of the `Decoder` class."""
         assert target_shape is not None, "target_shape must be provided"
         sample = self.conv_in(sample, causal=self.causal)
@@ -568,10 +594,46 @@ class Decoder(nn.Module):
         sample = sample.to(upscale_dtype)
         for up_block in self.up_blocks:
-            sample = checkpoint_fn(up_block)(sample, causal=self.causal)
         sample = self.conv_norm_out(sample)
         sample = self.conv_act(sample)
         sample = self.conv_out(sample, causal=self.causal)
@@ -731,11 +793,18 @@ class UNetMidBlock3D(nn.Module):
         resnet_groups: int = 32,
         norm_layer: str = "group_norm",
         inject_noise: bool = False,
     ):
         super().__init__()
         resnet_groups = (
             resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
         )
         self.res_blocks = nn.ModuleList(
             [
@@ -748,17 +817,38 @@ class UNetMidBlock3D(nn.Module):
                     dropout=dropout,
                     norm_layer=norm_layer,
                     inject_noise=inject_noise,
                 )
                 for _ in range(num_layers)
             ]
         )
     def forward(
-        self, hidden_states: torch.FloatTensor, causal: bool = True
     ) -> torch.FloatTensor:
         for resnet in self.res_blocks:
-            hidden_states = resnet(hidden_states, causal=causal)
         return hidden_states
@@ -846,6 +936,7 @@ class ResnetBlock3D(nn.Module):
         eps: float = 1e-6,
         norm_layer: str = "group_norm",
         inject_noise: bool = False,
     ):
         super().__init__()
         self.in_channels = in_channels
@@ -915,6 +1006,13 @@ class ResnetBlock3D(nn.Module):
             else nn.Identity()
         )
     def _feed_spatial_noise(
         self, hidden_states: torch.FloatTensor, per_channel_scale: torch.FloatTensor
     ) -> torch.FloatTensor:
@@ -933,10 +1031,29 @@ class ResnetBlock3D(nn.Module):
         self,
         input_tensor: torch.FloatTensor,
         causal: bool = True,
     ) -> torch.FloatTensor:
         hidden_states = input_tensor
         hidden_states = self.norm1(hidden_states)
         hidden_states = self.non_linearity(hidden_states)
@@ -949,6 +1066,9 @@ class ResnetBlock3D(nn.Module):
         hidden_states = self.norm2(hidden_states)
         hidden_states = self.non_linearity(hidden_states)
         hidden_states = self.dropout(hidden_states)
@@ -962,6 +1082,8 @@ class ResnetBlock3D(nn.Module):
         input_tensor = self.norm3(input_tensor)
         input_tensor = self.conv_shortcut(input_tensor)
         output_tensor = input_tensor + hidden_states
@@ -1013,35 +1135,42 @@ def unpatchify(x, patch_size_hw, patch_size_t=1):
 def create_video_autoencoder_config(
     latent_channels: int = 64,
 ):
-    config = {
         "_class_name": "CausalVideoAutoencoder",
-        "dims": 3,  # (2, 1),  # 2 for Conv2, 3 for Conv3d, (2, 1) for Conv2d followed by Conv1d
-        "in_channels": 3,  # Number of input color channels (e.g., RGB)
-        "out_channels": 3,  # Number of output color channels
-        "latent_channels": latent_channels,  # Number of channels in the latent space representation
-        "blocks": [
-            ("res_x", 4),
-            ("compress_space", 1),
-            ("res_x_y", 1),
-            ("res_x", 2),
-            ("compress_all", 1),
-            ("res_x", 3),
-            ("compress_all", 1),
-            ("res_x_y", 1),
-            ("res_x", 2),
-            ("compress_time", 1),
-            ("res_x", 3),
-            ("res_x", 3),
-        ],
         "patch_size": 4,
         "latent_log_var": "uniform",
         "use_quant_conv": False,
-        "norm_layer": "layer_norm",
-        "causal_decoder": True,
     }
-    return config
 def test_vae_patchify_unpatchify():
     import torch
@@ -1075,8 +1204,9 @@ def demo_video_autoencoder_forward_backward():
     print(f"input shape={input_videos.shape}")
     print(f"latent shape={latent.shape}")
     reconstructed_videos = video_autoencoder.decode(
-        latent, target_shape=input_videos.shape
     ).sample
     print(f"reconstructed shape={reconstructed_videos.shape}")
@@ -1084,16 +1214,16 @@ def demo_video_autoencoder_forward_backward():
     # Validate that single image gets treated the same way as first frame
     input_image = input_videos[:, :, :1, :, :]
     image_latent = video_autoencoder.encode(input_image).latent_dist.mode()
-    reconstructed_image = video_autoencoder.decode(
-        image_latent, target_shape=image_latent.shape
     ).sample
-    first_frame_latent = latent[:, :, :1, :, :]
     # assert torch.allclose(image_latent, first_frame_latent, atol=1e-6)
     # assert torch.allclose(reconstructed_image, reconstructed_videos[:, :, :1, :, :], atol=1e-6)
-    assert (image_latent == first_frame_latent).all()
-    assert (reconstructed_image == reconstructed_videos[:, :, :1, :, :]).all()
     # Calculate the loss (e.g., mean squared error)
     loss = torch.nn.functional.mse_loss(input_videos, reconstructed_videos)

 from torch import nn
 from diffusers.utils import logging
 import torch.nn.functional as F
+from diffusers.models.embeddings import PixArtAlphaCombinedTimestepSizeEmbeddings
 from xora.models.autoencoders.conv_nd_factory import make_conv_nd, make_linear_nd
 from xora.models.autoencoders.pixel_norm import PixelNorm
             patch_size=config.get("patch_size", 1),
             norm_layer=config.get("norm_layer", "group_norm"),
             causal=config.get("causal_decoder", False),
+            timestep_conditioning=config.get("timestep_conditioning", False),
         )
         dims = config["dims"]
             latent_log_var=self.encoder.latent_log_var,
             use_quant_conv=self.use_quant_conv,
             causal_decoder=self.decoder.causal,
+            timestep_conditioning=self.decoder.timestep_conditioning,
         )
     @property
         patch_size: int = 1,
         norm_layer: str = "group_norm",
         causal: bool = True,
+        timestep_conditioning: bool = False,
     ):
         super().__init__()
         self.patch_size = patch_size
                     norm_layer=norm_layer,
                     attention_head_dim=block_params["attention_head_dim"],
                     inject_noise=block_params.get("inject_noise", False),
+                    timestep_conditioning=timestep_conditioning,
                 )
             elif block_name == "res_x_y":
                 output_channel = output_channel // block_params.get("multiplier", 2)
                     groups=norm_num_groups,
                     norm_layer=norm_layer,
                     inject_noise=block_params.get("inject_noise", False),
+                    timestep_conditioning=False,
                 )
             elif block_name == "compress_time":
                 block = DepthToSpaceUpsample(
         self.gradient_checkpointing = False
+        self.timestep_conditioning = timestep_conditioning
+        if timestep_conditioning:
+            self.timestep_scale_multiplier = nn.Parameter(
+                torch.tensor(1000.0, dtype=torch.float32)
+            )
+            self.last_time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(
+                output_channel * 2, 0
+            )
+            self.last_scale_shift_table = nn.Parameter(
+                torch.randn(2, output_channel) / output_channel**0.5
+            )
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        target_shape,
+        timesteps: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
         r"""The forward method of the `Decoder` class."""
         assert target_shape is not None, "target_shape must be provided"
+        batch_size = sample.shape[0]
         sample = self.conv_in(sample, causal=self.causal)
         sample = sample.to(upscale_dtype)
+        if self.timestep_conditioning:
+            assert (
+                timesteps is not None
+            ), "should pass timesteps with timestep_conditioning=True"
+            scaled_timesteps = timesteps * self.timestep_scale_multiplier
         for up_block in self.up_blocks:
+            if self.timestep_conditioning and isinstance(up_block, UNetMidBlock3D):
+                sample = checkpoint_fn(up_block)(
+                    sample, causal=self.causal, timesteps=scaled_timesteps
+                )
+            else:
+                sample = checkpoint_fn(up_block)(sample, causal=self.causal)
         sample = self.conv_norm_out(sample)
+        if self.timestep_conditioning:
+            embedded_timesteps = self.last_time_embedder(
+                timestep=scaled_timesteps.flatten(),
+                resolution=None,
+                aspect_ratio=None,
+                batch_size=sample.shape[0],
+                hidden_dtype=sample.dtype,
+            )
+            embedded_timesteps = embedded_timesteps.view(
+                batch_size, embedded_timesteps.shape[-1], 1, 1, 1
+            )
+            ada_values = self.last_scale_shift_table[
+                None, ..., None, None, None
+            ] + embedded_timesteps.reshape(
+                batch_size,
+                2,
+                -1,
+                embedded_timesteps.shape[-3],
+                embedded_timesteps.shape[-2],
+                embedded_timesteps.shape[-1],
+            )
+            shift, scale = ada_values.unbind(dim=1)
+            sample = sample * (1 + scale) + shift
         sample = self.conv_act(sample)
         sample = self.conv_out(sample, causal=self.causal)
         resnet_groups: int = 32,
         norm_layer: str = "group_norm",
         inject_noise: bool = False,
+        timestep_conditioning: bool = False,
     ):
         super().__init__()
         resnet_groups = (
             resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
         )
+        self.timestep_conditioning = timestep_conditioning
+        if timestep_conditioning:
+            self.time_embedder = PixArtAlphaCombinedTimestepSizeEmbeddings(
+                in_channels * 4, 0
+            )
         self.res_blocks = nn.ModuleList(
             [
                     dropout=dropout,
                     norm_layer=norm_layer,
                     inject_noise=inject_noise,
+                    timestep_conditioning=timestep_conditioning,
                 )
                 for _ in range(num_layers)
             ]
         )
     def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        causal: bool = True,
+        timesteps: Optional[torch.Tensor] = None,
     ) -> torch.FloatTensor:
+        timestep_embed = None
+        if self.timestep_conditioning:
+            assert (
+                timesteps is not None
+            ), "should pass timesteps with timestep_conditioning=True"
+            batch_size = hidden_states.shape[0]
+            timestep_embed = self.time_embedder(
+                timestep=timesteps.flatten(),
+                resolution=None,
+                aspect_ratio=None,
+                batch_size=batch_size,
+                hidden_dtype=hidden_states.dtype,
+            )
+            timestep_embed = timestep_embed.view(
+                batch_size, timestep_embed.shape[-1], 1, 1, 1
+            )
         for resnet in self.res_blocks:
+            hidden_states = resnet(
+                hidden_states, causal=causal, timesteps=timestep_embed
+            )
         return hidden_states
         eps: float = 1e-6,
         norm_layer: str = "group_norm",
         inject_noise: bool = False,
+        timestep_conditioning: bool = False,
     ):
         super().__init__()
         self.in_channels = in_channels
             else nn.Identity()
         )
+        self.timestep_conditioning = timestep_conditioning
+        if timestep_conditioning:
+            self.scale_shift_table = nn.Parameter(
+                torch.randn(4, in_channels) / in_channels**0.5
+            )
     def _feed_spatial_noise(
         self, hidden_states: torch.FloatTensor, per_channel_scale: torch.FloatTensor
     ) -> torch.FloatTensor:
         self,
         input_tensor: torch.FloatTensor,
         causal: bool = True,
+        timesteps: Optional[torch.Tensor] = None,
     ) -> torch.FloatTensor:
         hidden_states = input_tensor
+        batch_size = hidden_states.shape[0]
         hidden_states = self.norm1(hidden_states)
+        if self.timestep_conditioning:
+            assert (
+                timesteps is not None
+            ), "should pass timesteps with timestep_conditioning=True"
+            ada_values = self.scale_shift_table[
+                None, ..., None, None, None
+            ] + timesteps.reshape(
+                batch_size,
+                4,
+                -1,
+                timesteps.shape[-3],
+                timesteps.shape[-2],
+                timesteps.shape[-1],
+            )
+            shift1, scale1, shift2, scale2 = ada_values.unbind(dim=1)
+            hidden_states = hidden_states * (1 + scale1) + shift1
         hidden_states = self.non_linearity(hidden_states)
         hidden_states = self.norm2(hidden_states)
+        if self.timestep_conditioning:
+            hidden_states = hidden_states * (1 + scale2) + shift2
         hidden_states = self.non_linearity(hidden_states)
         hidden_states = self.dropout(hidden_states)
         input_tensor = self.norm3(input_tensor)
+        batch_size = input_tensor.shape[0]
         input_tensor = self.conv_shortcut(input_tensor)
         output_tensor = input_tensor + hidden_states
 def create_video_autoencoder_config(
     latent_channels: int = 64,
 ):
+    encoder_blocks = [
+        ("res_x", {"num_layers": 4}),
+        ("compress_all_x_y", {"multiplier": 3}),
+        ("res_x", {"num_layers": 4}),
+        ("compress_all_x_y", {"multiplier": 2}),
+        ("res_x", {"num_layers": 4}),
+        ("compress_all", {}),
+        ("res_x", {"num_layers": 3}),
+        ("res_x", {"num_layers": 4}),
+    ]
+    decoder_blocks = [
+        ("res_x", {"num_layers": 4}),
+        ("compress_all", {"residual": True}),
+        ("res_x_y", {"multiplier": 3}),
+        ("res_x", {"num_layers": 3}),
+        ("compress_all", {"residual": True}),
+        ("res_x_y", {"multiplier": 2}),
+        ("res_x", {"num_layers": 3}),
+        ("compress_all", {"residual": True}),
+        ("res_x", {"num_layers": 3}),
+        ("res_x", {"num_layers": 4}),
+    ]
+    return {
         "_class_name": "CausalVideoAutoencoder",
+        "dims": 3,
+        "encoder_blocks": encoder_blocks,
+        "decoder_blocks": decoder_blocks,
+        "latent_channels": latent_channels,
+        "norm_layer": "pixel_norm",
         "patch_size": 4,
         "latent_log_var": "uniform",
         "use_quant_conv": False,
+        "causal_decoder": False,
+        "timestep_conditioning": True,
     }
 def test_vae_patchify_unpatchify():
     import torch
     print(f"input shape={input_videos.shape}")
     print(f"latent shape={latent.shape}")
+    timesteps = torch.ones(input_videos.shape[0]) * 0.1
     reconstructed_videos = video_autoencoder.decode(
+        latent, target_shape=input_videos.shape, timesteps=timesteps
     ).sample
     print(f"reconstructed shape={reconstructed_videos.shape}")
     # Validate that single image gets treated the same way as first frame
     input_image = input_videos[:, :, :1, :, :]
     image_latent = video_autoencoder.encode(input_image).latent_dist.mode()
+    _ = video_autoencoder.decode(
+        image_latent, target_shape=image_latent.shape, timesteps=timesteps
     ).sample
+    # first_frame_latent = latent[:, :, :1, :, :]
     # assert torch.allclose(image_latent, first_frame_latent, atol=1e-6)
     # assert torch.allclose(reconstructed_image, reconstructed_videos[:, :, :1, :, :], atol=1e-6)
+    # assert (image_latent == first_frame_latent).all()
+    # assert (reconstructed_image == reconstructed_videos[:, :, :1, :, :]).all()
     # Calculate the loss (e.g., mean squared error)
     loss = torch.nn.functional.mse_loss(input_videos, reconstructed_videos)

xora/models/autoencoders/vae.py CHANGED Viewed

@@ -251,14 +251,21 @@ class AutoencoderKLWrapper(ModelMixin, ConfigMixin):
         return moments
     def _decode(
-        self, z: torch.FloatTensor, target_shape=None
     ) -> Union[DecoderOutput, torch.FloatTensor]:
         z = self.post_quant_conv(z)
-        dec = self.decoder(z, target_shape=target_shape)
         return dec
     def decode(
-        self, z: torch.FloatTensor, return_dict: bool = True, target_shape=None
     ) -> Union[DecoderOutput, torch.FloatTensor]:
         assert target_shape is not None, "target_shape must be provided for decoding"
         if self.use_z_tiling and z.shape[2] > self.z_sample_size > 1:
@@ -291,7 +298,7 @@ class AutoencoderKLWrapper(ModelMixin, ConfigMixin):
             decoded = (
                 self._hw_tiled_decode(z, target_shape)
                 if self.use_hw_tiling
-                else self._decode(z, target_shape=target_shape)
             )
         if not return_dict:

         return moments
     def _decode(
+        self,
+        z: torch.FloatTensor,
+        target_shape=None,
+        timesteps: Optional[torch.Tensor] = None,
     ) -> Union[DecoderOutput, torch.FloatTensor]:
         z = self.post_quant_conv(z)
+        dec = self.decoder(z, target_shape=target_shape, timesteps=timesteps)
         return dec
     def decode(
+        self,
+        z: torch.FloatTensor,
+        return_dict: bool = True,
+        target_shape=None,
+        timesteps: Optional[torch.Tensor] = None,
     ) -> Union[DecoderOutput, torch.FloatTensor]:
         assert target_shape is not None, "target_shape must be provided for decoding"
         if self.use_z_tiling and z.shape[2] > self.z_sample_size > 1:
             decoded = (
                 self._hw_tiled_decode(z, target_shape)
                 if self.use_hw_tiling
+                else self._decode(z, target_shape=target_shape, timesteps=timesteps)
             )
         if not return_dict: