flux-mini / model.py

Create model.py

54c32af verified about 1 month ago

9.53 kB

	from dataclasses import dataclass
	import numpy as np
	import torch

	from torch import Tensor, nn
	from einops import rearrange

	from layers import (DoubleStreamBlock, EmbedND, LastLayer,
	MLPEmbedder, SingleStreamBlock,
	timestep_embedding)

	import torch.distributed as dist
	from diffusers.models.embeddings import get_1d_sincos_pos_embed_from_grid

	from accelerate.logging import get_logger
	# Module-level logger obtained from accelerate's `get_logger`, requested at
	# INFO level. It is not referenced anywhere else in the visible part of this file.
	logger = get_logger(__name__, log_level="INFO")




	@dataclass
	class FluxParams:
	    """Hyper-parameters read by ``Flux.__init__`` to build the network.

	    Field order is part of the positional constructor signature and must
	    not be changed.
	    """

	    # --- input feature widths -------------------------------------------
	    # Width of the image token features; Flux also uses it as output width.
	    in_channels: int
	    # Input width of the ``vector_in`` embedder (applied to ``y``).
	    vec_in_dim: int
	    # Input width of the ``txt_in`` projection (applied to ``txt``).
	    context_in_dim: int

	    # --- transformer geometry -------------------------------------------
	    # Model width; Flux requires it to be divisible by ``num_heads``.
	    hidden_size: int
	    # Forwarded unchanged to every double- and single-stream block.
	    mlp_ratio: float
	    # Number of attention heads handed to every block.
	    num_heads: int
	    # Number of ``DoubleStreamBlock`` instances.
	    depth: int
	    # Number of ``SingleStreamBlock`` instances.
	    depth_single_blocks: int

	    # --- positional embedding -------------------------------------------
	    # Forwarded to ``EmbedND``; must sum to ``hidden_size // num_heads``.
	    axes_dim: list[int]
	    # Forwarded to ``EmbedND`` as ``theta``.
	    theta: int

	    # --- switches -------------------------------------------------------
	    # Forwarded to every ``DoubleStreamBlock``.
	    qkv_bias: bool
	    # When true, Flux builds a guidance embedder and ``forward`` requires
	    # a ``guidance`` tensor.
	    guidance_embed: bool


	class Flux(nn.Module):
	    """
	    Transformer model for flow matching on sequences.

	    The image and text sequences are first passed through a stack of
	    double-stream blocks (each takes and returns both sequences), then
	    concatenated along dim 1 (text first) and passed through the
	    single-stream blocks. ``final_layer`` is applied to the image part only.
	    """

	    _supports_gradient_checkpointing = True

	    # Width of the sinusoidal embedding produced by ``timestep_embedding`` and
	    # consumed by the ``time_in`` / ``guidance_in`` embedders.
	    _TIMESTEP_EMBED_DIM = 256

	    def __init__(self, params: "FluxParams"):
	        """Build all sub-modules described by ``params``.

	        Raises:
	            ValueError: If ``hidden_size`` is not divisible by ``num_heads``,
	                or if ``sum(axes_dim)`` differs from ``hidden_size // num_heads``.
	        """
	        super().__init__()

	        self.params = params
	        self.in_channels = params.in_channels
	        # The model emits as many channels as it receives.
	        self.out_channels = self.in_channels
	        if params.hidden_size % params.num_heads != 0:
	            raise ValueError(
	                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
	            )
	        pe_dim = params.hidden_size // params.num_heads
	        if sum(params.axes_dim) != pe_dim:
	            raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
	        self.hidden_size = params.hidden_size
	        self.num_heads = params.num_heads
	        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)

	        self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
	        self.time_in = MLPEmbedder(in_dim=self._TIMESTEP_EMBED_DIM, hidden_dim=self.hidden_size)
	        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
	        # Without guidance embedding the attribute is a placeholder that
	        # ``forward`` never calls.
	        self.guidance_in = (
	            MLPEmbedder(in_dim=self._TIMESTEP_EMBED_DIM, hidden_dim=self.hidden_size)
	            if params.guidance_embed
	            else nn.Identity()
	        )
	        self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)

	        self.double_blocks = nn.ModuleList(
	            [
	                DoubleStreamBlock(
	                    self.hidden_size,
	                    self.num_heads,
	                    mlp_ratio=params.mlp_ratio,
	                    qkv_bias=params.qkv_bias,
	                )
	                for _ in range(params.depth)
	            ]
	        )

	        self.single_blocks = nn.ModuleList(
	            [
	                SingleStreamBlock(
	                    self.hidden_size,
	                    self.num_heads,
	                    mlp_ratio=params.mlp_ratio,
	                )
	                for _ in range(params.depth_single_blocks)
	            ]
	        )

	        self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
	        # NOTE(review): checkpointing is switched on by default; it only takes
	        # effect while the module is in training mode (see ``forward``).
	        self.gradient_checkpointing = True

	    def _set_gradient_checkpointing(self, module, value=False):
	        """Set ``module.gradient_checkpointing`` to ``value`` if the attribute exists."""
	        if hasattr(module, "gradient_checkpointing"):
	            module.gradient_checkpointing = value

	    @staticmethod
	    def _checkpointed(block, *inputs):
	        """Run ``block(*inputs)`` under non-reentrant activation checkpointing."""
	        return torch.utils.checkpoint.checkpoint(block, *inputs, use_reentrant=False)

	    @property
	    def attn_processors(self):
	        """Collect the processors of every sub-module exposing ``set_processor``.

	        Returns:
	            dict: Maps ``"<dotted module path>.processor"`` to the module's
	            ``processor`` attribute.
	        """
	        processors = {}

	        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors):
	            if hasattr(module, "set_processor"):
	                processors[f"{name}.processor"] = module.processor

	            for sub_name, child in module.named_children():
	                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

	            return processors

	        for name, module in self.named_children():
	            fn_recursive_add_processors(name, module, processors)

	        return processors

	    def set_attn_processor(self, processor):
	        r"""
	        Sets the attention processor to use to compute attention.

	        Parameters:
	            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
	                The instantiated processor class or a dictionary of processor classes that will be set as the
	                processor for all `Attention` layers.

	                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
	                processor. This is strongly recommended when setting trainable attention processors.
	                The dict passed by the caller is not modified.

	        Raises:
	            ValueError: If a dict is passed whose length differs from the number of attention layers.
	            KeyError: If a dict is passed that lacks the key of one of the attention layers.
	        """
	        count = len(self.attn_processors)

	        if isinstance(processor, dict):
	            if len(processor) != count:
	                raise ValueError(
	                    f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
	                    f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
	                )
	            # Entries are popped while walking the module tree; work on a
	            # shallow copy so the caller's mapping is left intact.
	            processor = dict(processor)

	        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
	            if hasattr(module, "set_processor"):
	                if not isinstance(processor, dict):
	                    module.set_processor(processor)
	                else:
	                    module.set_processor(processor.pop(f"{name}.processor"))

	            for sub_name, child in module.named_children():
	                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

	        for name, module in self.named_children():
	            fn_recursive_attn_processor(name, module, processor)

	    def forward(
	        self,
	        img: Tensor,
	        img_ids: Tensor,
	        txt: Tensor,
	        txt_ids: Tensor,
	        timesteps: Tensor,
	        y: Tensor,
	        block_controlnet_hidden_states=None,
	        guidance: Tensor = None,
	        image_proj: Tensor = None,
	        ip_scale: Tensor = 1.0,
	        return_intermediate: bool = False,
	    ):
	        """Run the model on an image sequence conditioned on text, time and ``y``.

	        Args:
	            img: Image sequence; projected by ``img_in``, so its last dimension
	                must be ``in_channels``. Dim 1 is treated as the sequence axis.
	            img_ids: Position ids for ``img``; concatenated after ``txt_ids``
	                along dim 1 and fed to the positional embedder.
	            txt: Text sequence; projected by ``txt_in`` (last dimension
	                ``context_in_dim``).
	            txt_ids: Position ids for ``txt``.
	            timesteps: Passed through ``timestep_embedding`` and ``time_in``.
	            y: Conditioning vector embedded by ``vector_in``.
	            block_controlnet_hidden_states: Optional indexable of tensors added
	                to ``img`` after each double-stream block, selected with
	                ``index_block % 2`` (so only entries 0 and 1 are ever read).
	            guidance: Guidance strength; required when
	                ``params.guidance_embed`` is true, ignored otherwise.
	            image_proj: Forwarded unchanged to every double-stream block.
	            ip_scale: Forwarded unchanged to every double-stream block.
	            return_intermediate: When true, also return per-block outputs.

	        Returns:
	            The output of ``final_layer`` on the image part of the sequence.
	            With ``return_intermediate`` a tuple
	            ``(out, intermediate_double, intermediate_single)`` where the lists
	            hold one ``[img, txt]`` pair per double / single block.

	        Raises:
	            ValueError: If guidance embedding is enabled and ``guidance`` is None.
	        """
	        # Activation checkpointing is only used while training.
	        use_checkpoint = self.training and self.gradient_checkpointing
	        intermediate_double = []
	        intermediate_single = []

	        # running on sequences img
	        img = self.img_in(img)
	        vec = self.time_in(timestep_embedding(timesteps, self._TIMESTEP_EMBED_DIM))
	        if self.params.guidance_embed:
	            if guidance is None:
	                raise ValueError("Didn't get guidance strength for guidance distilled model.")
	            vec = vec + self.guidance_in(timestep_embedding(guidance, self._TIMESTEP_EMBED_DIM))
	        vec = vec + self.vector_in(y)
	        txt = self.txt_in(txt)

	        ids = torch.cat((txt_ids, img_ids), dim=1)
	        pe = self.pe_embedder(ids)

	        for index_block, block in enumerate(self.double_blocks):
	            if use_checkpoint:
	                img, txt = self._checkpointed(block, img, txt, vec, pe, image_proj, ip_scale)
	            else:
	                img, txt = block(
	                    img=img,
	                    txt=txt,
	                    vec=vec,
	                    pe=pe,
	                    image_proj=image_proj,
	                    ip_scale=ip_scale,
	                )

	            # Intermediates are recorded before the ControlNet residual is added.
	            if return_intermediate:
	                intermediate_double.append([img, txt])

	            if block_controlnet_hidden_states is not None:
	                # NOTE(review): the modulus is hard-coded to 2, i.e. this
	                # assumes exactly two ControlNet residuals - confirm with caller.
	                img = img + block_controlnet_hidden_states[index_block % 2]

	        # Single-stream blocks operate on one joint sequence, text tokens first.
	        txt_len = txt.shape[1]
	        img = torch.cat((txt, img), dim=1)
	        for block in self.single_blocks:
	            if use_checkpoint:
	                img = self._checkpointed(block, img, vec, pe)
	            else:
	                img = block(img, vec=vec, pe=pe)

	            # Split the joint sequence only when the caller asked for it;
	            # splitting and re-concatenating otherwise is a pure copy.
	            if return_intermediate:
	                intermediate_single.append([img[:, txt_len:, ...], img[:, :txt_len, ...]])

	        # Drop the text tokens; only the image part is decoded.
	        img = img[:, txt_len:, ...]
	        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
	        if return_intermediate:
	            return img, intermediate_double, intermediate_single
	        return img