Spaces:

yslan
/

ObjCtrl-2.5D

Running on Zero

File size: 15,918 Bytes

38e3f9b

from dataclasses import dataclass
from typing import Callable, Optional

import torch
from torch import nn

from diffusers.utils import BaseOutput
from diffusers.models.attention_processor import Attention
from diffusers.models.attention import FeedForward

from typing import Dict, Any
from cameractrl.models.attention_processor import PoseAdaptorAttnProcessor

from einops import rearrange
import math


class InflatedGroupNorm(nn.GroupNorm):
    """``nn.GroupNorm`` for 5-D inputs laid out as ``b c f h w``.

    The ``f`` axis (dim 2) is folded into the batch axis, the regular
    ``nn.GroupNorm`` forward runs on the resulting 4-D tensor, and the
    original 5-D layout is restored afterwards.
    """

    def forward(self, x):
        # Remember the size of dim 2 so the fold can be undone below.
        num_frames = x.shape[2]

        folded = rearrange(x, "b c f h w -> (b f) c h w")
        normed = super().forward(folded)

        return rearrange(normed, "(b f) c h w -> b c f h w", f=num_frames)

def zero_module(module):
    """Set every parameter of ``module`` to zero in place and return ``module``."""
    for param in module.parameters():
        # Write through a detached alias of the parameter so the in-place
        # zeroing is not recorded by autograd.
        param.detach().zero_()
    return module


@dataclass
class TemporalTransformer3DModelOutput(BaseOutput):
    """Output container with a single tensor field, ``sample``."""
    # NOTE(review): nothing in the visible code constructs this class, so the
    # shape of `sample` cannot be confirmed from here.
    sample: torch.FloatTensor


def get_motion_module(
        in_channels,
        motion_module_type: str,
        motion_module_kwargs: dict
):
    """Build the motion module selected by ``motion_module_type``.

    Args:
        in_channels: Forwarded to the module constructor as ``in_channels``.
        motion_module_type: Name of the module to build. Only ``"Vanilla"``
            is supported.
        motion_module_kwargs: Extra keyword arguments forwarded to the
            module constructor.

    Returns:
        A ``VanillaTemporalModule`` instance.

    Raises:
        ValueError: If ``motion_module_type`` is not a supported name.
    """
    if motion_module_type == "Vanilla":
        return VanillaTemporalModule(in_channels=in_channels, **motion_module_kwargs)

    # Name the offending value so a misconfiguration is easy to diagnose.
    raise ValueError(
        f"Unsupported motion_module_type {motion_module_type!r}; expected 'Vanilla'."
    )


class VanillaTemporalModule(nn.Module):
    """Thin wrapper that owns a single ``TemporalTransformer3DModel``.

    The constructor forwards its options to the transformer and, when
    ``zero_initialize`` is set, zeroes the parameters of the transformer's
    ``proj_out`` layer.
    """

    def __init__(
            self,
            in_channels,
            num_attention_heads=8,
            num_transformer_block=2,
            attention_block_types=("Temporal_Self",),
            temporal_position_encoding=True,
            temporal_position_encoding_max_len=32,
            temporal_attention_dim_div=1,
            cross_attention_dim=320,
            zero_initialize=True,
            encoder_hidden_states_query=(False, False),
            attention_activation_scale=1.0,
            attention_processor_kwargs: Optional[Dict] = None,
            causal_temporal_attention=False,
            causal_temporal_attention_mask_type="",
            rescale_output_factor=1.0
    ):
        """
        Args:
            in_channels: Channel count handed to the transformer.
            num_attention_heads: Number of attention heads.
            num_transformer_block: Passed to the transformer as ``num_layers``.
            temporal_attention_dim_div: Extra divisor for the per-head width,
                which is ``in_channels // num_attention_heads //
                temporal_attention_dim_div``.
            zero_initialize: When truthy, zero the parameters of the
                transformer's ``proj_out`` layer after construction.
            attention_processor_kwargs: Forwarded to the transformer; ``None``
                is treated as an empty dict.

        All remaining arguments are forwarded unchanged to
        ``TemporalTransformer3DModel``.
        """
        super().__init__()

        # Use a fresh dict per instance rather than one shared mutable default.
        if attention_processor_kwargs is None:
            attention_processor_kwargs = {}

        self.temporal_transformer = TemporalTransformer3DModel(
            in_channels=in_channels,
            num_attention_heads=num_attention_heads,
            attention_head_dim=in_channels // num_attention_heads // temporal_attention_dim_div,
            num_layers=num_transformer_block,
            attention_block_types=attention_block_types,
            cross_attention_dim=cross_attention_dim,
            temporal_position_encoding=temporal_position_encoding,
            temporal_position_encoding_max_len=temporal_position_encoding_max_len,
            encoder_hidden_states_query=encoder_hidden_states_query,
            attention_activation_scale=attention_activation_scale,
            attention_processor_kwargs=attention_processor_kwargs,
            causal_temporal_attention=causal_temporal_attention,
            causal_temporal_attention_mask_type=causal_temporal_attention_mask_type,
            rescale_output_factor=rescale_output_factor
        )

        if zero_initialize:
            self.temporal_transformer.proj_out = zero_module(self.temporal_transformer.proj_out)

    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None,
                cross_attention_kwargs: Optional[Dict[str, Any]] = None):
        """Run the wrapped temporal transformer and return its output.

        Args:
            hidden_states: Input passed to the transformer.
            temb: Accepted for interface compatibility; not used here.
            encoder_hidden_states: Forwarded to the transformer.
            attention_mask: Forwarded to the transformer.
            cross_attention_kwargs: Extra keyword arguments forwarded to the
                transformer; ``None`` is treated as an empty dict.
        """
        # Use a fresh dict per call rather than one shared mutable default.
        if cross_attention_kwargs is None:
            cross_attention_kwargs = {}

        return self.temporal_transformer(
            hidden_states,
            encoder_hidden_states,
            attention_mask,
            cross_attention_kwargs=cross_attention_kwargs,
        )


class TemporalTransformer3DModel(nn.Module):
    """Transformer applied along the frame axis of a ``b c f h w`` tensor.

    ``forward`` group-normalises the input, folds the two trailing axes into
    the batch axis so every ``(h, w)`` location becomes a sequence over ``f``,
    runs the transformer blocks between two linear projections, restores the
    original layout and adds the input back as a residual.
    """

    def __init__(
            self,
            in_channels,
            num_attention_heads,
            attention_head_dim,
            num_layers,
            attention_block_types=("Temporal_Self", "Temporal_Self",),
            dropout=0.0,
            norm_num_groups=32,
            cross_attention_dim=320,
            activation_fn="geglu",
            attention_bias=False,
            upcast_attention=False,
            temporal_position_encoding=False,
            temporal_position_encoding_max_len=32,
            encoder_hidden_states_query=(False, False),
            attention_activation_scale=1.0,
            attention_processor_kwargs: Optional[Dict] = None,

            causal_temporal_attention=None,
            causal_temporal_attention_mask_type="",
            rescale_output_factor=1.0
    ):
        """
        Args:
            in_channels: Channel count of the input; used by the group norm
                and as the input/output width of the two linear projections.
            num_attention_heads: Number of attention heads per block.
            attention_head_dim: Width of one head. The inner width is
                ``num_attention_heads * attention_head_dim``.
            num_layers: Number of ``TemporalTransformerBlock`` instances.
            norm_num_groups: Group count of the input group norm (also
                forwarded to the blocks).
            attention_processor_kwargs: Forwarded to every block; ``None`` is
                treated as an empty dict.
            causal_temporal_attention: Must not be ``None``. When truthy,
                ``forward`` ignores the caller's ``attention_mask`` and uses a
                generated mask instead.
            causal_temporal_attention_mask_type: Pattern of the generated
                mask: ``"causal"``, ``"2-seq"``, ``"0-prev"``, ``"0"``,
                ``"wo-self"`` or ``"circle"``. Must be non-empty when
                ``causal_temporal_attention`` is truthy.

        All remaining arguments are forwarded unchanged to every
        ``TemporalTransformerBlock``.

        Raises:
            AssertionError: If ``causal_temporal_attention`` is ``None``, or
                it is truthy while the mask type is the empty string.
        """
        super().__init__()
        assert causal_temporal_attention is not None, "causal_temporal_attention must be set explicitly"
        self.causal_temporal_attention = causal_temporal_attention

        assert (not causal_temporal_attention) or (causal_temporal_attention_mask_type != ""), \
            "causal_temporal_attention requires a non-empty causal_temporal_attention_mask_type"
        self.causal_temporal_attention_mask_type = causal_temporal_attention_mask_type
        # Lazily built by get_causal_temporal_attention_mask and cached here.
        self.causal_temporal_attention_mask = None

        # Use a fresh dict per instance rather than one shared mutable default.
        if attention_processor_kwargs is None:
            attention_processor_kwargs = {}

        inner_dim = num_attention_heads * attention_head_dim

        self.norm = InflatedGroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
        self.proj_in = nn.Linear(in_channels, inner_dim)

        self.transformer_blocks = nn.ModuleList(
            [
                TemporalTransformerBlock(
                    dim=inner_dim,
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                    attention_block_types=attention_block_types,
                    dropout=dropout,
                    norm_num_groups=norm_num_groups,
                    cross_attention_dim=cross_attention_dim,
                    activation_fn=activation_fn,
                    attention_bias=attention_bias,
                    upcast_attention=upcast_attention,
                    temporal_position_encoding=temporal_position_encoding,
                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
                    encoder_hidden_states_query=encoder_hidden_states_query,
                    attention_activation_scale=attention_activation_scale,
                    attention_processor_kwargs=attention_processor_kwargs,
                    rescale_output_factor=rescale_output_factor,
                )
                for _ in range(num_layers)
            ]
        )
        self.proj_out = nn.Linear(inner_dim, in_channels)

    def _build_binary_temporal_mask(self, sequence_length):
        """Return the 0/1 ``(seq, seq)`` pattern for the configured mask type.

        A 1 marks a position that stays unmasked; a 0 marks one that is turned
        into ``-inf`` by the caller.

        Raises:
            ValueError: If the configured mask type is not recognised.
        """
        mask_type = self.causal_temporal_attention_mask_type

        if mask_type == "causal":
            # 1. vanilla causal mask
            mask = torch.tril(torch.ones(sequence_length, sequence_length))

        elif mask_type == "2-seq":
            # 2. 2-seq: two diagonal blocks, one per half of the sequence
            mask = torch.zeros(sequence_length, sequence_length)
            mask[:sequence_length // 2, :sequence_length // 2] = 1
            mask[-sequence_length // 2:, -sequence_length // 2:] = 1

        elif mask_type == "0-prev":
            # attn to the first frame and to the previous frame
            indices = torch.arange(sequence_length)
            indices_prev = indices - 1
            indices_prev[0] = 0
            mask = torch.zeros(sequence_length, sequence_length)
            mask[:, 0] = 1.
            mask[indices, indices_prev] = 1.

        elif mask_type == "0":
            # only attn to first frame
            mask = torch.zeros(sequence_length, sequence_length)
            mask[:, 0] = 1

        elif mask_type == "wo-self":
            # everything except the diagonal
            indices = torch.arange(sequence_length)
            mask = torch.ones(sequence_length, sequence_length)
            mask[indices, indices] = 0

        elif mask_type == "circle":
            # diagonal plus previous position, with row 0 wrapping to the last column
            indices = torch.arange(sequence_length)
            indices_prev = indices - 1
            indices_prev[0] = 0

            mask = torch.eye(sequence_length)
            mask[indices, indices_prev] = 1
            mask[0, -1] = 1

        else:
            raise ValueError(
                f"Unknown causal_temporal_attention_mask_type {mask_type!r}; expected one of "
                "'causal', '2-seq', '0-prev', '0', 'wo-self', 'circle'."
            )

        return mask

    def get_causal_temporal_attention_mask(self, hidden_states):
        """Return the additive attention mask for ``hidden_states``.

        The mask has shape ``(batch, seq, seq)`` taken from the first two
        dimensions of ``hidden_states``, holds ``0.0`` at unmasked positions
        and ``-inf`` elsewhere, and lives on the device of ``hidden_states``.
        It is cached on the module and rebuilt whenever the required shape or
        the device changes.

        Raises:
            ValueError: If the configured mask type is not recognised.
        """
        batch_size, sequence_length, _ = hidden_states.shape
        expected_shape = (batch_size, sequence_length, sequence_length)

        cached = self.causal_temporal_attention_mask
        # Also compare devices: a cache keyed on shape alone would hand back a
        # mask on the old device after the inputs move to another one.
        if (
                cached is None
                or cached.shape != expected_shape
                or cached.device != hidden_states.device
        ):
            binary = self._build_binary_temporal_mask(sequence_length)

            # generate attention mask from binary values
            mask = binary.masked_fill(binary == 0, float('-inf')).masked_fill(binary == 1, float(0.0))
            mask = mask.unsqueeze(0)
            mask = mask.repeat(batch_size, 1, 1)

            cached = mask.to(hidden_states.device)
            self.causal_temporal_attention_mask = cached

        return cached

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None,
                cross_attention_kwargs: Optional[Dict[str, Any]] = None):
        """Apply temporal attention to a 5-D input and add it to the input.

        Args:
            hidden_states: 5-D tensor laid out as ``b c f h w``.
            encoder_hidden_states: Forwarded to every transformer block.
            attention_mask: Forwarded to every block unless causal temporal
                attention is enabled, in which case the generated mask is
                used instead.
            cross_attention_kwargs: Extra keyword arguments forwarded to every
                block; ``None`` is treated as an empty dict.

        Returns:
            A tensor with the same layout as ``hidden_states``.
        """
        # Use a fresh dict per call rather than one shared mutable default.
        if cross_attention_kwargs is None:
            cross_attention_kwargs = {}

        residual = hidden_states

        assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
        height, width = hidden_states.shape[-2:]

        hidden_states = self.norm(hidden_states)
        # Every (h, w) location becomes its own sequence over frames.
        hidden_states = rearrange(hidden_states, "b c f h w -> (b h w) f c")
        hidden_states = self.proj_in(hidden_states)

        if self.causal_temporal_attention:
            attention_mask = self.get_causal_temporal_attention_mask(hidden_states)

        # Transformer Blocks
        for block in self.transformer_blocks:
            hidden_states = block(hidden_states, encoder_hidden_states=encoder_hidden_states,
                                  attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs)
        hidden_states = self.proj_out(hidden_states)

        hidden_states = rearrange(hidden_states, "(b h w) f c -> b c f h w", h=height, w=width)

        return hidden_states + residual


class TemporalTransformerBlock(nn.Module):
    """Residual attention layers followed by a residual feed-forward layer.

    One ``TemporalSelfAttention`` and one ``nn.LayerNorm`` are created per
    entry of ``attention_block_types``. Each attention layer and the
    feed-forward layer are applied to a layer-normalised copy of the hidden
    states and added back to the un-normalised hidden states.
    """

    def __init__(
            self,
            dim,
            num_attention_heads,
            attention_head_dim,
            attention_block_types=("Temporal_Self", "Temporal_Self",),
            dropout=0.0,
            norm_num_groups=32,
            cross_attention_dim=768,
            activation_fn="geglu",
            attention_bias=False,
            upcast_attention=False,
            temporal_position_encoding=False,
            temporal_position_encoding_max_len=32,
            encoder_hidden_states_query=(False, False),
            attention_activation_scale=1.0,
            attention_processor_kwargs: Optional[Dict] = None,
            rescale_output_factor=1.0
    ):
        """
        Args:
            dim: Feature width; used as ``query_dim`` of every attention
                layer and as the width of the layer norms and feed-forward.
            num_attention_heads: Passed to each attention layer as ``heads``.
            attention_head_dim: Passed to each attention layer as ``dim_head``.
            attention_block_types: One attention layer is built per entry;
                each entry is passed on as ``attention_mode``.
            dropout: Dropout for the attention layers and the feed-forward.
            cross_attention_dim: Passed to an attention layer only when its
                block type is ``'Temporal_Cross'`` or
                ``'Temporal_Pose_Adaptor'``; otherwise ``None`` is passed.
            activation_fn: Activation name for the feed-forward layer.
            attention_bias: Passed to each attention layer as ``bias``.
            upcast_attention, temporal_position_encoding,
            temporal_position_encoding_max_len, rescale_output_factor:
                Forwarded unchanged to each attention layer.
            norm_num_groups, encoder_hidden_states_query,
            attention_activation_scale, attention_processor_kwargs:
                Accepted for interface compatibility; not read by this class.
        """
        super().__init__()

        self.attention_block_types = attention_block_types

        attention_blocks = []
        norms = []
        for block_name in attention_block_types:
            attention_blocks.append(
                TemporalSelfAttention(
                    attention_mode=block_name,
                    cross_attention_dim=cross_attention_dim if block_name in ['Temporal_Cross', 'Temporal_Pose_Adaptor'] else None,
                    query_dim=dim,
                    heads=num_attention_heads,
                    dim_head=attention_head_dim,
                    dropout=dropout,
                    bias=attention_bias,
                    upcast_attention=upcast_attention,
                    temporal_position_encoding=temporal_position_encoding,
                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
                    rescale_output_factor=rescale_output_factor,
                )
            )
            norms.append(nn.LayerNorm(dim))

        self.attention_blocks = nn.ModuleList(attention_blocks)
        self.norms = nn.ModuleList(norms)

        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
        self.ff_norm = nn.LayerNorm(dim)

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None,
                cross_attention_kwargs: Optional[Dict[str, Any]] = None):
        """Apply every attention layer and then the feed-forward layer.

        Args:
            hidden_states: Input tensor.
            encoder_hidden_states: Passed to attention layers whose block type
                is not ``'Temporal_Self'``; ``'Temporal_Self'`` layers receive
                the normalised hidden states instead.
            attention_mask: Forwarded to every attention layer.
            cross_attention_kwargs: Extra keyword arguments unpacked into
                every attention call; ``None`` is treated as an empty dict.

        Returns:
            The updated hidden states.
        """
        # Use a fresh dict per call rather than one shared mutable default.
        if cross_attention_kwargs is None:
            cross_attention_kwargs = {}

        for attention_block, norm, attention_block_type in zip(self.attention_blocks, self.norms, self.attention_block_types):
            norm_hidden_states = norm(hidden_states)
            hidden_states = attention_block(
                norm_hidden_states,
                encoder_hidden_states=norm_hidden_states if attention_block_type == 'Temporal_Self' else encoder_hidden_states,
                attention_mask=attention_mask,
                **cross_attention_kwargs
            ) + hidden_states

        hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states

        return hidden_states


class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position table along dim 1, then apply dropout.

    The table ``pe`` has shape ``(1, max_len, d_model)`` and is registered as
    a buffer. Even feature columns hold sines and odd columns hold cosines of
    ``position * exp(-log(10000) * k / d_model)`` for ``k = 0, 2, 4, ...``.
    """

    def __init__(
            self,
            d_model,
            dropout=0.,
            max_len=32,
    ):
        """
        Args:
            d_model: Size of the last dimension of the inputs. Odd values are
                supported.
            dropout: Dropout probability applied after the addition.
            max_len: Number of positions in the table; inputs are expected to
                have ``x.size(1) <= max_len``.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the cosine term to fit instead of failing on a shape mismatch.
        pe[0, :, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``."""
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


class TemporalSelfAttention(Attention):
    """``Attention`` subclass restricted to the ``"Temporal_Self"`` mode.

    Optionally adds a ``PositionalEncoding`` to the hidden states before the
    attention processor runs. The processor is always called with
    ``encoder_hidden_states=None``.
    """

    def __init__(
            self,
            attention_mode=None,
            temporal_position_encoding=False,
            temporal_position_encoding_max_len=32,
            rescale_output_factor=1.0,
            *args, **kwargs
    ):
        """
        Args:
            attention_mode: Must equal ``"Temporal_Self"``.
            temporal_position_encoding: When truthy, build a position encoder
                sized by ``kwargs["query_dim"]``.
            temporal_position_encoding_max_len: ``max_len`` of that encoder.
            rescale_output_factor: Stored on the instance after the base
                constructor has run.
            *args, **kwargs: Forwarded to ``Attention.__init__``.
        """
        super().__init__(*args, **kwargs)
        assert attention_mode == "Temporal_Self"

        if temporal_position_encoding:
            # query_dim is looked up in kwargs, so it must be passed by keyword.
            self.pos_encoder = PositionalEncoding(
                kwargs["query_dim"],
                max_len=temporal_position_encoding_max_len
            )
        else:
            self.pos_encoder = None
        self.rescale_output_factor = rescale_output_factor

    def set_use_memory_efficient_attention_xformers(
            self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None
    ):
        """Deliberate no-op: requests to toggle xformers attention are ignored.

        The original author disabled xformers for the motion module to avoid
        bad results, for reasons not yet understood.
        """
        # TODO: fix this bug
        return None

    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
        """Add the position encoding (if any) and delegate to ``self.processor``.

        A ``"pose_feature"`` entry in ``cross_attention_kwargs`` that is 5-D is
        flattened from ``b c f h w`` to ``(b h w) f c``; otherwise it must be
        3-D. For a ``PoseAdaptorAttnProcessor`` the pose feature is passed as
        the third positional argument; any other processor receives it as a
        keyword argument when present. The ``encoder_hidden_states`` argument
        is not forwarded; the processor always receives ``None`` for it.
        """
        # add position encoding
        if self.pos_encoder is not None:
            hidden_states = self.pos_encoder(hidden_states)

        if "pose_feature" in cross_attention_kwargs:
            pose = cross_attention_kwargs["pose_feature"]
            if pose.ndim != 5:
                assert pose.ndim == 3
            else:
                pose = rearrange(pose, "b c f h w -> (b h w) f c")
            cross_attention_kwargs["pose_feature"] = pose

        processor = self.processor
        fixed_kwargs = dict(encoder_hidden_states=None, attention_mask=attention_mask)

        if isinstance(processor, PoseAdaptorAttnProcessor):
            # Remove the pose feature from the kwargs so it is only passed positionally.
            pose = cross_attention_kwargs.pop('pose_feature')
            return processor(self, hidden_states, pose, **fixed_kwargs, **cross_attention_kwargs)

        if hasattr(processor, "__call__"):
            return processor.__call__(self, hidden_states, **fixed_kwargs, **cross_attention_kwargs)

        return processor(self, hidden_states, **fixed_kwargs, **cross_attention_kwargs)