Spaces:

yslan
/

LN3Diff_I23D

Running on Zero

LN3Diff_I23D / dit /dit_decoder.py

NIRVANALAN

init

11e6f7b 3 months ago

8.96 kB

	import torch
	import torch.nn as nn
	import numpy as np
	import math

	from einops import rearrange
	from pdb import set_trace as st

	# from .dit_models import DiT, DiTBlock, DiT_models, get_2d_sincos_pos_embed, modulate, FinalLayer

	from .dit_models_xformers import DiT, DiTBlock, DiT_models, get_2d_sincos_pos_embed, modulate, FinalLayer
	# from .dit_models import DiT, DiTBlock, DiT_models, get_2d_sincos_pos_embed, modulate, FinalLayer


	def modulate2(x, shift, scale):
	return x * (1 + scale) + shift


	class DiTBlock2(DiTBlock):
	"""
	A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
	"""

	def __init__(self, hidden_size, num_heads, mlp_ratio=4, **block_kwargs):
	super().__init__(hidden_size, num_heads, mlp_ratio, **block_kwargs)

	def forward(self, x, c):
	shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(
	c).chunk(6, dim=-1)
	# st()
	x = x + gate_msa * self.attn(
	modulate2(self.norm1(x), shift_msa, scale_msa))
	x = x + gate_mlp * self.mlp(
	modulate2(self.norm2(x), shift_mlp, scale_mlp))
	return x


	class FinalLayer2(FinalLayer):
	"""
	The final layer of DiT, basically the decoder_pred in MAE with adaLN.
	"""

	def __init__(self, hidden_size, patch_size, out_channels):
	super().__init__(hidden_size, patch_size, out_channels)

	def forward(self, x, c):
	shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
	x = modulate2(self.norm_final(x), shift, scale)
	x = self.linear(x)
	return x


	class DiT2(DiT):
	# a conditional ViT
	def __init__(self,
	input_size=32,
	patch_size=2,
	in_channels=4,
	hidden_size=1152,
	depth=28,
	num_heads=16,
	mlp_ratio=4,
	class_dropout_prob=0.1,
	num_classes=1000,
	learn_sigma=True,
	mixing_logit_init=-3,
	mixed_prediction=True,
	context_dim=False,
	roll_out=False,
	plane_n=3,
	return_all_layers=False,
	vit_blk=...):
	super().__init__(input_size,
	patch_size,
	in_channels,
	hidden_size,
	depth,
	num_heads,
	mlp_ratio,
	class_dropout_prob,
	num_classes,
	learn_sigma,
	mixing_logit_init,
	mixed_prediction,
	context_dim,
	roll_out,
	vit_blk=DiTBlock2,
	final_layer_blk=FinalLayer2)

	# no t and x embedder
	del self.x_embedder
	del self.t_embedder
	del self.final_layer
	torch.cuda.empty_cache()
	self.clip_text_proj = None
	self.plane_n = plane_n
	self.return_all_layers = return_all_layers

	def forward(self, c, args, *kwargs):
	# return super().forward(x, timesteps, context, y, get_attr, **kwargs)
	"""
	Forward pass of DiT.
	c: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
	"""
	x = self.pos_embed.repeat(
	c.shape[0], 1, 1) # (N, T, D), where T = H * W / patch_size ** 2

	if self.return_all_layers:
	all_layers = []

	# if context is not None:
	# c = context # B 3HW C

	for blk_idx, block in enumerate(self.blocks):
	if self.roll_out:
	if blk_idx % 2 == 0: # with-in plane self attention
	x = rearrange(x, 'b (n l) c -> (b n) l c ', n=self.plane_n)
	x = block(x,
	rearrange(c,
	'b (n l) c -> (b n) l c ',
	n=self.plane_n)) # (N, T, D)
	# st()
	if self.return_all_layers:
	all_layers.append(x)
	else: # global attention
	x = rearrange(x, '(b n) l c -> b (n l) c ', n=self.plane_n)
	x = block(x, c) # (N, T, D)
	# st()
	if self.return_all_layers:
	# all merged into B dim
	all_layers.append(
	rearrange(x,
	'b (n l) c -> (b n) l c',
	n=self.plane_n))
	else:
	x = block(x, c) # (N, T, D)

	# x = self.final_layer(x, c) # (N, T, patch_size ** 2 * out_channels)

	# if self.roll_out: # move n from L to B axis
	# x = rearrange(x, 'b (n l) c ->(b n) l c', n=3)

	# x = self.unpatchify(x) # (N, out_channels, H, W)

	# if self.roll_out: # move n from L to B axis
	# x = rearrange(x, '(b n) c h w -> b (n c) h w', n=3)

	if self.return_all_layers:
	return all_layers
	else:
	return x


	# class DiT2_DPT(DiT2):
	# def __init__(self, input_size=32, patch_size=2, in_channels=4, hidden_size=1152, depth=28, num_heads=16, mlp_ratio=4, class_dropout_prob=0.1, num_classes=1000, learn_sigma=True, mixing_logit_init=-3, mixed_prediction=True, context_dim=False, roll_out=False, plane_n=3, vit_blk=...):
	# super().__init__(input_size, patch_size, in_channels, hidden_size, depth, num_heads, mlp_ratio, class_dropout_prob, num_classes, learn_sigma, mixing_logit_init, mixed_prediction, context_dim, roll_out, plane_n, vit_blk)
	# self.return_all_layers = True

	#################################################################################
	# DiT2 Configs #
	#################################################################################


	def DiT2_XL_2(**kwargs):
	return DiT2(depth=28,
	hidden_size=1152,
	patch_size=2,
	num_heads=16,
	**kwargs)


	def DiT2_XL_2_half(**kwargs):
	return DiT2(depth=28 // 2,
	hidden_size=1152,
	patch_size=2,
	num_heads=16,
	**kwargs)


	def DiT2_XL_4(**kwargs):
	return DiT2(depth=28,
	hidden_size=1152,
	patch_size=4,
	num_heads=16,
	**kwargs)


	def DiT2_XL_8(**kwargs):
	return DiT2(depth=28,
	hidden_size=1152,
	patch_size=8,
	num_heads=16,
	**kwargs)


	def DiT2_L_2(**kwargs):
	return DiT2(depth=24,
	hidden_size=1024,
	patch_size=2,
	num_heads=16,
	**kwargs)


	def DiT2_L_2_half(**kwargs):
	return DiT2(depth=24 // 2,
	hidden_size=1024,
	patch_size=2,
	num_heads=16,
	**kwargs)


	def DiT2_L_4(**kwargs):
	return DiT2(depth=24,
	hidden_size=1024,
	patch_size=4,
	num_heads=16,
	**kwargs)


	def DiT2_L_8(**kwargs):
	return DiT2(depth=24,
	hidden_size=1024,
	patch_size=8,
	num_heads=16,
	**kwargs)


	def DiT2_B_2(**kwargs):
	return DiT2(depth=12,
	hidden_size=768,
	patch_size=2,
	num_heads=12,
	**kwargs)


	def DiT2_B_4(**kwargs):
	return DiT2(depth=12,
	hidden_size=768,
	patch_size=4,
	num_heads=12,
	**kwargs)


	def DiT2_B_8(**kwargs):
	return DiT2(depth=12,
	hidden_size=768,
	patch_size=8,
	num_heads=12,
	**kwargs)


	def DiT2_B_16(**kwargs): # ours cfg
	return DiT2(depth=12,
	hidden_size=768,
	patch_size=16,
	num_heads=12,
	**kwargs)


	def DiT2_S_2(**kwargs):
	return DiT2(depth=12, hidden_size=384, patch_size=2, num_heads=6, **kwargs)


	def DiT2_S_4(**kwargs):
	return DiT2(depth=12, hidden_size=384, patch_size=4, num_heads=6, **kwargs)


	def DiT2_S_8(**kwargs):
	return DiT2(depth=12, hidden_size=384, patch_size=8, num_heads=6, **kwargs)


	DiT2_models = {
	'DiT2-XL/2': DiT2_XL_2,
	'DiT2-XL/2/half': DiT2_XL_2_half,
	'DiT2-XL/4': DiT2_XL_4,
	'DiT2-XL/8': DiT2_XL_8,
	'DiT2-L/2': DiT2_L_2,
	'DiT2-L/2/half': DiT2_L_2_half,
	'DiT2-L/4': DiT2_L_4,
	'DiT2-L/8': DiT2_L_8,
	'DiT2-B/2': DiT2_B_2,
	'DiT2-B/4': DiT2_B_4,
	'DiT2-B/8': DiT2_B_8,
	'DiT2-B/16': DiT2_B_16,
	'DiT2-S/2': DiT2_S_2,
	'DiT2-S/4': DiT2_S_4,
	'DiT2-S/8': DiT2_S_8,
	}