demo_test / ppdiffusers /models /prior_transformer.py
xianbao's picture
xianbao HF staff
Upload with huggingface_hub
72895aa
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import Optional, Union
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ..configuration_utils import ConfigMixin, register_to_config
from ..modeling_utils import ModelMixin
from ..utils import BaseOutput
from .attention import BasicTransformerBlock
from .embeddings import TimestepEmbedding, Timesteps
NEG_INF = -1e4
@dataclass
class PriorTransformerOutput(BaseOutput):
"""
Args:
predicted_image_embedding (`paddle.Tensor` of shape `(batch_size, embedding_dim)`):
The predicted CLIP image embedding conditioned on the CLIP text embedding input.
"""
predicted_image_embedding: paddle.Tensor
class PriorTransformer(ModelMixin, ConfigMixin):
"""
The prior transformer from unCLIP is used to predict CLIP image embeddings from CLIP text embeddings. Note that the
transformer predicts the image embeddings through a denoising diffusion process.
This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
implements for all the models (such as downloading or saving, etc.)
For more details, see the original paper: https://arxiv.org/abs/2204.06125
Parameters:
num_attention_heads (`int`, *optional*, defaults to 32): The number of heads to use for multi-head attention.
attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use.
embedding_dim (`int`, *optional*, defaults to 768): The dimension of the CLIP embeddings. Note that CLIP
image embeddings and text embeddings are both the same dimension.
num_embeddings (`int`, *optional*, defaults to 77): The max number of clip embeddings allowed. I.e. the
length of the prompt after it has been tokenized.
additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the
projected hidden_states. The actual length of the used hidden_states is `num_embeddings +
additional_embeddings`.
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
"""
@register_to_config
def __init__(
self,
num_attention_heads: int = 32,
attention_head_dim: int = 64,
num_layers: int = 20,
embedding_dim: int = 768,
num_embeddings=77,
additional_embeddings=4,
dropout: float = 0.0,
):
super().__init__()
self.num_attention_heads = num_attention_heads
self.attention_head_dim = attention_head_dim
inner_dim = num_attention_heads * attention_head_dim
self.additional_embeddings = additional_embeddings
self.time_proj = Timesteps(inner_dim, True, 0)
self.time_embedding = TimestepEmbedding(inner_dim, inner_dim)
self.proj_in = nn.Linear(embedding_dim, inner_dim)
self.embedding_proj = nn.Linear(embedding_dim, inner_dim)
self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim)
self.positional_embedding = self.create_parameter(
(1, num_embeddings + additional_embeddings, inner_dim),
dtype=paddle.get_default_dtype(),
default_initializer=nn.initializer.Constant(0.0),
)
self.prd_embedding = self.create_parameter(
(1, 1, inner_dim), dtype=paddle.get_default_dtype(), default_initializer=nn.initializer.Constant(0.0)
)
self.transformer_blocks = nn.LayerList(
[
BasicTransformerBlock(
inner_dim,
num_attention_heads,
attention_head_dim,
dropout=dropout,
activation_fn="gelu",
attention_bias=True,
)
for d in range(num_layers)
]
)
self.norm_out = nn.LayerNorm(inner_dim)
self.proj_to_clip_embeddings = nn.Linear(inner_dim, embedding_dim)
causal_attention_mask = paddle.triu(
paddle.full([num_embeddings + additional_embeddings, num_embeddings + additional_embeddings], NEG_INF), 1
)
causal_attention_mask = causal_attention_mask.unsqueeze(0)
self.register_buffer("causal_attention_mask", causal_attention_mask, persistable=False)
self.clip_mean = self.create_parameter(
(1, embedding_dim), dtype=paddle.get_default_dtype(), default_initializer=nn.initializer.Constant(0.0)
)
self.clip_std = self.create_parameter(
(1, embedding_dim), dtype=paddle.get_default_dtype(), default_initializer=nn.initializer.Constant(0.0)
)
def forward(
self,
hidden_states,
timestep: Union[paddle.Tensor, float, int],
proj_embedding: paddle.Tensor,
encoder_hidden_states: paddle.Tensor,
attention_mask: Optional[paddle.Tensor] = None,
return_dict: bool = True,
):
"""
Args:
hidden_states (`paddle.Tensor` of shape `(batch_size, embedding_dim)`):
x_t, the currently predicted image embeddings.
timestep (`paddle.Tensor`):
Current denoising step.
proj_embedding (`paddle.Tensor` of shape `(batch_size, embedding_dim)`):
Projected embedding vector the denoising process is conditioned on.
encoder_hidden_states (`paddle.Tensor` of shape `(batch_size, num_embeddings, embedding_dim)`):
Hidden states of the text embeddings the denoising process is conditioned on.
attention_mask (`paddle.Tensor` of shape `(batch_size, num_embeddings)`):
Text mask for the text embeddings.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`models.prior_transformer.PriorTransformerOutput`] instead of a plain
tuple.
Returns:
[`~models.prior_transformer.PriorTransformerOutput`] or `tuple`:
[`~models.prior_transformer.PriorTransformerOutput`] if `return_dict` is True, otherwise a `tuple`. When
returning a tuple, the first element is the sample tensor.
"""
batch_size = hidden_states.shape[0]
timesteps = timestep
if not paddle.is_tensor(timesteps):
timesteps = paddle.to_tensor([timesteps], dtype=paddle.int64)
elif paddle.is_tensor(timesteps) and len(timesteps.shape) == 0:
timesteps = timesteps[None]
# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
timesteps = timesteps * paddle.ones((batch_size,), dtype=timesteps.dtype)
timesteps_projected = self.time_proj(timesteps)
# timesteps does not contain any weights and will always return f32 tensors
# but time_embedding might be fp16, so we need to cast here.
timesteps_projected = timesteps_projected.cast(dtype=self.dtype)
time_embeddings = self.time_embedding(timesteps_projected)
proj_embeddings = self.embedding_proj(proj_embedding)
encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states)
hidden_states = self.proj_in(hidden_states)
prd_embedding = self.prd_embedding.cast(hidden_states.dtype).expand([batch_size, -1, -1])
positional_embeddings = self.positional_embedding.cast(hidden_states.dtype)
hidden_states = paddle.concat(
[
encoder_hidden_states,
proj_embeddings[:, None, :],
time_embeddings[:, None, :],
hidden_states[:, None, :],
prd_embedding,
],
axis=1,
)
hidden_states = hidden_states + positional_embeddings
if attention_mask is not None:
attention_mask = (1 - attention_mask.cast(hidden_states.dtype)) * -10000.0
attention_mask = F.pad(
attention_mask.unsqueeze(0), (0, self.additional_embeddings), value=0.0, data_format="NCL"
).squeeze(0)
attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).cast(hidden_states.dtype)
attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, axis=0)
for block in self.transformer_blocks:
hidden_states = block(hidden_states, attention_mask=attention_mask)
hidden_states = self.norm_out(hidden_states)
hidden_states = hidden_states[:, -1]
predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states)
if not return_dict:
return (predicted_image_embedding,)
return PriorTransformerOutput(predicted_image_embedding=predicted_image_embedding)
def post_process_latents(self, prior_latents):
prior_latents = (prior_latents * self.clip_std) + self.clip_mean
return prior_latents