渊旷 commited on
Commit
7bfcf76
1 Parent(s): 7ed4dfb
README.md CHANGED
@@ -1,3 +1,32 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: text-to-image
3
+ license: apache-2.0
4
+ tags:
5
+ - Non-Autoregressive
6
+ ---
7
+
8
+ # Meissonic: Revitalizing Masked Generative Transformers for Efficient High-Resolution Text-to-Image Synthesis
9
+
10
+ [Paper](https://arxiv.org/abs/2410.08261) | [Model](https://huggingface.co/MeissonFlow/Meissonic) | [Code](https://github.com/viiika/Meissonic) | [Demo](https://huggingface.co/spaces/MeissonFlow/meissonic)
11
+
12
+
13
+ ![demo](./assets/demos.png)
14
+
15
+
16
+ ## Introduction
17
+ Meissonic is a non-autoregressive text-to-image synthesis model based on masked image modeling (MIM) that can generate high-resolution images. It is designed to run on consumer graphics cards.
18
+
19
+ ## Usage
20
+
21
+ For installation and usage instructions, please refer to the [GitHub repository](https://github.com/viiika/Meissonic).
22
+
23
+ ## Citation
24
+ If you find this work helpful, please consider citing:
25
+ ```bibtex
26
+ @article{bai2024meissonic,
27
+ title={Meissonic: Revitalizing Masked Generative Transformers for Efficient High-Resolution Text-to-Image Synthesis},
28
+ author={Bai, Jinbin and Ye, Tian and Chow, Wei and Song, Enxin and Chen, Qing-Guo and Li, Xiangtai and Dong, Zhen and Zhu, Lei and Yan, Shuicheng},
29
+ journal={arXiv preprint arXiv:2410.08261},
30
+ year={2024}
31
+ }
32
+ ```
model_index.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "Pipeline",
3
+ "_diffusers_version": "0.30.2",
4
+ "scheduler": [
5
+ "scheduler",
6
+ "Scheduler"
7
+ ],
8
+ "text_encoder": [
9
+ "transformers",
10
+ "CLIPTextModelWithProjection"
11
+ ],
12
+ "tokenizer": [
13
+ "transformers",
14
+ "CLIPTokenizer"
15
+ ],
16
+ "transformer": [
17
+ "transformer",
18
+ "Transformer2DModel"
19
+ ],
20
+ "vqvae": [
21
+ "diffusers",
22
+ "VQModel"
23
+ ]
24
+ }
scheduler/scheduler.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The HuggingFace Team and The MeissonFlow Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import math
15
+ from dataclasses import dataclass
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+
20
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
21
+ from diffusers.utils import BaseOutput
22
+ from diffusers.schedulers.scheduling_utils import SchedulerMixin
23
+ import torch.nn.functional as F
24
+
25
def gumbel_noise(t, generator=None):
    """
    Draw Gumbel(0, 1) noise with the same shape and dtype as `t`.

    Uniform samples are drawn on the generator's device when a generator is given (otherwise on `t`'s device) and
    then moved to `t`'s device, so a CPU generator can be combined with an accelerator tensor.

    Args:
        t (`torch.Tensor`): Tensor whose shape, dtype and device define the returned noise.
        generator (`torch.Generator`, *optional*): Random generator used for the uniform draw.

    Returns:
        `torch.Tensor`: `-log(-log(u))` for `u ~ U(0, 1)`, located on `t.device`.
    """
    sampling_device = t.device if generator is None else generator.device
    uniform = torch.zeros_like(t, device=sampling_device).uniform_(0, 1, generator=generator)
    uniform = uniform.to(t.device)
    # Both logarithm arguments are clamped away from zero so the result stays finite.
    inner = -torch.log(uniform.clamp(1e-20))
    return -torch.log(inner.clamp(1e-20))
29
+
30
+
31
def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None):
    """
    Select the lowest-confidence positions of each row for re-masking.

    The confidence of a position is `log(prob)` perturbed by Gumbel noise scaled with `temperature`. Per row, every
    position whose confidence is strictly below the `mask_len`-th smallest confidence is marked.

    Args:
        mask_len (`torch.Tensor`): Per-row index into the ascending-sorted confidences, used with `torch.gather`
            along dim 1 (so it must be broadcast-compatible with `(batch, 1)`).
        probs (`torch.Tensor`): Probabilities of the sampled tokens, one row per batch element.
        temperature (`float` or `torch.Tensor`, defaults to 1.0): Scale of the Gumbel perturbation.
        generator (`torch.Generator`, *optional*): Random generator for the noise.

    Returns:
        `torch.BoolTensor`: `True` where the position should be masked again.
    """
    perturbation = temperature * gumbel_noise(probs, generator=generator)
    scores = torch.log(probs.clamp(1e-20)) + perturbation
    ascending, _ = torch.sort(scores, dim=-1)
    threshold = torch.gather(ascending, 1, mask_len.long())
    return scores < threshold
37
+
38
+
39
@dataclass
class SchedulerOutput(BaseOutput):
    """
    Output class for the scheduler's `step` function output.

    Args:
        prev_sample (`torch.Tensor`):
            Token ids for the previous timestep `(x_{t-1})`, in which low-confidence positions have been replaced by
            the mask token id. `prev_sample` should be used as the next model input in the denoising loop. `step`
            returns it with shape `(batch_size, height, width)` for 2D inputs and `(batch_size, sequence_length)`
            otherwise.
        pred_original_sample (`torch.Tensor`, *optional*):
            The fully predicted token ids `(x_{0})` based on the model output from the current timestep, with the
            same shape as `prev_sample`. Can be used to preview progress or for guidance.
    """

    prev_sample: torch.Tensor
    # `None` is the default, so the annotation has to be Optional.
    pred_original_sample: Optional[torch.Tensor] = None
55
+
56
+
57
class Scheduler(SchedulerMixin, ConfigMixin):
    """
    Scheduler for iterative masked-token decoding.

    At every step the model's logits are sampled into token ids for all currently masked positions; the least
    confident of the newly sampled tokens are then masked again according to a cosine or linear masking schedule.

    Args:
        mask_token_id (`int`): Token id that marks a masked position in `sample`.
        masking_schedule (`str`, defaults to `"cosine"`): Either `"cosine"` or `"linear"`.
    """

    order = 1

    temperatures: torch.Tensor

    @register_to_config
    def __init__(
        self,
        mask_token_id: int,
        masking_schedule: str = "cosine",
    ):
        # Both are populated by `set_timesteps`.
        self.temperatures = None
        self.timesteps = None

    def set_timesteps(
        self,
        num_inference_steps: int,
        temperature: Union[int, Tuple[int, int], List[int]] = (2, 0),
        device: Union[str, torch.device] = None,
    ):
        """
        Build the descending timestep sequence and the per-step Gumbel temperatures.

        Args:
            num_inference_steps (`int`): Number of decoding steps; timesteps run from `num_inference_steps - 1` to 0.
            temperature (`int`, `tuple` or `list`, defaults to `(2, 0)`): A `(start, end)` pair that is linearly
                interpolated over the steps, or a single start value that is annealed towards 0.01.
            device (`str` or `torch.device`, *optional*): Device for the created tensors.
        """
        self.timesteps = torch.arange(num_inference_steps, device=device).flip(0)

        if isinstance(temperature, (tuple, list)):
            self.temperatures = torch.linspace(temperature[0], temperature[1], num_inference_steps, device=device)
        else:
            self.temperatures = torch.linspace(temperature, 0.01, num_inference_steps, device=device)

    ### from https://huggingface.co/transformers/v3.2.0/_modules/transformers/generation_utils.html
    def top_k_top_p_filtering(
        self,
        logits: torch.Tensor,
        top_k: int = 0,
        top_p: float = 1.0,
        filter_value: float = -float("Inf"),
        min_tokens_to_keep: int = 1,
    ):
        """Filter a distribution of logits using top-k and/or nucleus (top-p) filtering.

        NOTE: `logits` is modified in place and also returned.

        Args:
            logits: logits distribution; filtering is applied along the last (vocabulary) dimension.
            top_k: if > 0, keep only the top k tokens with highest probability (top-k filtering).
            top_p: if < 1.0, keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            filter_value: value written into the removed positions.
            min_tokens_to_keep: make sure at least this many tokens are kept per example.

        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
        """
        if top_k > 0:
            top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1))
            # Remove every logit that is smaller than the k-th largest one.
            indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
            logits[indices_to_remove] = filter_value

        if top_p < 1.0:
            sorted_logits, sorted_indices = torch.sort(logits, descending=True)
            cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

            sorted_indices_to_remove = cumulative_probs > top_p
            if min_tokens_to_keep > 1:
                sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
            # Shift right by one so the first token that crosses the threshold is kept as well.
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            # Map the removal mask from sorted order back to vocabulary order.
            indices_to_remove = torch.zeros_like(logits, dtype=torch.bool).scatter_(
                -1, sorted_indices, sorted_indices_to_remove
            )
            logits[indices_to_remove] = filter_value

        return logits

    def _step_index_and_mask_ratio(self, timestep) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the index of `timestep` in `self.timesteps` and the schedule's mask ratio at that step."""
        if self.timesteps is None:
            raise ValueError("`set_timesteps` must be called before `step` or `add_noise`.")

        step_idx = (self.timesteps == timestep).nonzero()
        ratio = (step_idx + 1) / len(self.timesteps)

        if self.config.masking_schedule == "cosine":
            mask_ratio = torch.cos(ratio * math.pi / 2)
        elif self.config.masking_schedule == "linear":
            mask_ratio = 1 - ratio
        else:
            raise ValueError(f"unknown masking schedule {self.config.masking_schedule}")

        return step_idx, mask_ratio

    def step(
        self,
        model_output: torch.Tensor,
        timestep: Union[int, torch.Tensor],
        sample: torch.LongTensor,
        starting_mask_ratio: float = 1,
        generator: Optional[torch.Generator] = None,
        return_dict: bool = True,
        using_topk_topp: Optional[bool] = False,
        sampling_temperature: Optional[float] = 1.0,
        top_k: int = 8192,
        top_p: float = 0.2,
    ) -> Union[SchedulerOutput, Tuple]:
        """
        Sample tokens for the masked positions and re-mask the least confident ones.

        Args:
            model_output (`torch.Tensor`): Logits. Either `(batch, codebook_size, height, width)` together with a
                `(batch, height, width)` sample, or a tensor whose last dimension is the vocabulary.
            timestep (`int` or `torch.Tensor`): Current timestep; at 0 no position is re-masked.
            sample (`torch.LongTensor`): Current token ids, with `mask_token_id` at masked positions.
            starting_mask_ratio (`float`, defaults to 1): Multiplier applied to the schedule's mask ratio.
            generator (`torch.Generator`, *optional*): Random generator for sampling and Gumbel noise.
            return_dict (`bool`, defaults to `True`): Return a `SchedulerOutput` instead of a plain tuple.
            using_topk_topp (`bool`, defaults to `False`): Apply `sampling_temperature` and top-k/top-p filtering
                to the logits before sampling.
            sampling_temperature (`float`, defaults to 1.0): Logit temperature; only used with `using_topk_topp`.
            top_k (`int`, defaults to 8192): Top-k cutoff; only used with `using_topk_topp`.
            top_p (`float`, defaults to 0.2): Nucleus cutoff; only used with `using_topk_topp`.

        Returns:
            `SchedulerOutput` or `tuple`: `(prev_sample, pred_original_sample)`.

        Raises:
            ValueError: If re-masking is needed but `set_timesteps` was never called, or the configured masking
                schedule is unknown.
        """
        two_dim_input = sample.ndim == 3 and model_output.ndim == 4

        if two_dim_input:
            # Flatten the spatial grid into a sequence and move the vocabulary axis last.
            batch_size, codebook_size, height, width = model_output.shape
            sample = sample.reshape(batch_size, height * width)
            model_output = model_output.reshape(batch_size, codebook_size, height * width).permute(0, 2, 1)

        unknown_map = sample == self.config.mask_token_id

        if using_topk_topp:
            # The division creates a new tensor, so the in-place filtering below never touches the caller's logits.
            model_output = model_output / max(sampling_temperature, 1e-5)
            if top_k > 0 or top_p < 1.0:
                model_output = self.top_k_top_p_filtering(model_output, top_k=top_k, top_p=top_p)

        probs = model_output.softmax(dim=-1)

        device = probs.device
        probs_ = probs.to(generator.device) if generator is not None else probs  # handles when generator is on CPU
        if probs_.device.type == "cpu" and probs_.dtype != torch.float32:
            probs_ = probs_.float()  # multinomial is not implemented for cpu half precision
        probs_ = probs_.reshape(-1, probs.size(-1))
        pred_original_sample = torch.multinomial(probs_, 1, generator=generator).to(device=device)
        pred_original_sample = pred_original_sample[:, 0].view(*probs.shape[:-1])
        # Positions that were already known keep their input token.
        pred_original_sample = torch.where(unknown_map, pred_original_sample, sample)

        if timestep == 0:
            prev_sample = pred_original_sample
        else:
            seq_len = sample.shape[1]
            step_idx, mask_ratio = self._step_index_and_mask_ratio(timestep)

            mask_ratio = starting_mask_ratio * mask_ratio

            mask_len = (seq_len * mask_ratio).floor()
            # do not mask more than amount previously masked
            mask_len = torch.min(unknown_map.sum(dim=-1, keepdim=True) - 1, mask_len)
            # mask at least one
            mask_len = torch.max(torch.tensor([1], device=model_output.device), mask_len)

            selected_probs = torch.gather(probs, -1, pred_original_sample[:, :, None])[:, :, 0]
            # Ignores the tokens given in the input by overwriting their confidence.
            selected_probs = torch.where(unknown_map, selected_probs, torch.finfo(selected_probs.dtype).max)

            masking = mask_by_random_topk(mask_len, selected_probs, self.temperatures[step_idx], generator)

            # Masks tokens with lower confidence.
            prev_sample = torch.where(masking, self.config.mask_token_id, pred_original_sample)

        if two_dim_input:
            prev_sample = prev_sample.reshape(batch_size, height, width)
            pred_original_sample = pred_original_sample.reshape(batch_size, height, width)

        if not return_dict:
            return (prev_sample, pred_original_sample)

        return SchedulerOutput(prev_sample, pred_original_sample)

    def add_noise(self, sample, timesteps, generator=None):
        """
        Randomly replace tokens of `sample` with the mask token, using the schedule's mask ratio at `timesteps`.

        Args:
            sample (`torch.Tensor`): Token ids to corrupt; the input tensor itself is left untouched.
            timesteps (`int` or `torch.Tensor`): Timestep whose mask ratio is applied.
            generator (`torch.Generator`, *optional*): Random generator for choosing the masked positions.

        Returns:
            `torch.Tensor`: A copy of `sample` with randomly chosen positions set to `mask_token_id`.

        Raises:
            ValueError: If `set_timesteps` was never called or the configured masking schedule is unknown.
        """
        _, mask_ratio = self._step_index_and_mask_ratio(timesteps)
        # The timesteps may live on a different device than the sample.
        mask_ratio = mask_ratio.to(sample.device)

        mask_indices = (
            torch.rand(
                sample.shape, device=generator.device if generator is not None else sample.device, generator=generator
            ).to(sample.device)
            < mask_ratio
        )

        masked_sample = sample.clone()

        masked_sample[mask_indices] = self.config.mask_token_id

        return masked_sample
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "Scheduler",
3
+ "_diffusers_version": "0.30.2",
4
+ "mask_token_id": 8255,
5
+ "masking_schedule": "cosine"
6
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "CLIPTextModelWithProjection"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 0,
7
+ "dropout": 0.0,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_size": 1024,
11
+ "initializer_factor": 1.0,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 77,
16
+ "model_type": "clip_text_model",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 24,
19
+ "pad_token_id": 1,
20
+ "projection_dim": 1024,
21
+ "torch_dtype": "float32",
22
+ "transformers_version": "4.44.2",
23
+ "vocab_size": 49408
24
+ }
text_encoder/model.fp16.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42a6a63bcfcb0d7cc9e2a687134ceb7cb83d0346285636ec8547e7ffa2bcd224
3
+ size 708111984
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f32c52903fc74d29d0ea3f0ceea8080eec7ad4b2913e16555a4e546df0f37c7f
3
+ size 1416177568
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "!",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "!",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "49406": {
13
+ "content": "<|startoftext|>",
14
+ "lstrip": false,
15
+ "normalized": true,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "49407": {
21
+ "content": "<|endoftext|>",
22
+ "lstrip": false,
23
+ "normalized": true,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ }
28
+ },
29
+ "bos_token": "<|startoftext|>",
30
+ "clean_up_tokenization_spaces": true,
31
+ "do_lower_case": true,
32
+ "eos_token": "<|endoftext|>",
33
+ "errors": "replace",
34
+ "model_max_length": 77,
35
+ "pad_token": "!",
36
+ "tokenizer_class": "CLIPTokenizer",
37
+ "unk_token": "<|endoftext|>"
38
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
transformer/config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "Transformer2DModel",
3
+ "_diffusers_version": "0.30.2",
4
+ "attention_head_dim": 128,
5
+ "axes_dims_rope": [
6
+ 16,
7
+ 56,
8
+ 56
9
+ ],
10
+ "codebook_size": 8192,
11
+ "downsample": true,
12
+ "guidance_embeds": false,
13
+ "in_channels": 64,
14
+ "joint_attention_dim": 1024,
15
+ "num_attention_heads": 8,
16
+ "num_layers": 14,
17
+ "num_single_layers": 28,
18
+ "patch_size": 1,
19
+ "pooled_projection_dim": 1024,
20
+ "upsample": true,
21
+ "vocab_size": 8256
22
+ }
transformer/diffusion_pytorch_model.fp16.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1f55cdc9d2f78d50840e1f52dbd407e86e12c9d209f59c79629900b23ce70e1
3
+ size 2013993248
transformer/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a44b1775411bea393b930ad6524d536ef316910e6a898c9400394e21c7fe632f
3
+ size 4027886416
transformer/transformer.py ADDED
@@ -0,0 +1,1215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Black Forest Labs, The HuggingFace Team, The InstantX Team and The MeissonFlow Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import Any, Dict, Optional, Tuple, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+
23
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
24
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
25
+ from diffusers.models.attention import FeedForward, BasicTransformerBlock, SkipFFTransformerBlock
26
+ from diffusers.models.attention_processor import (
27
+ Attention,
28
+ AttentionProcessor,
29
+ FluxAttnProcessor2_0,
30
+ # FusedFluxAttnProcessor2_0,
31
+ )
32
+ from diffusers.models.modeling_utils import ModelMixin
33
+ from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle, GlobalResponseNorm, RMSNorm
34
+ from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
35
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
36
+ from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings,TimestepEmbedding, get_timestep_embedding #,FluxPosEmbed
37
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
38
+ from diffusers.models.resnet import Downsample2D, Upsample2D
39
+
40
+ from typing import List
41
+
42
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
43
+
44
+
45
+
46
def get_3d_rotary_pos_embed(
    embed_dim, crops_coords, grid_size, temporal_size, theta: int = 10000, use_real: bool = True
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """
    RoPE for video tokens with 3D structure.

    The embedding dimension is split into a temporal part (`embed_dim // 4`) and two spatial parts
    (`embed_dim // 8 * 3` each, for height and width); the per-axis angles are concatenated in that order.

    Args:
        embed_dim: (`int`):
            The embedding dimension size, corresponding to hidden_size_head.
        crops_coords (`Tuple[int]`):
            The top-left and bottom-right coordinates of the crop.
        grid_size (`Tuple[int]`):
            The grid size of the spatial positional embedding (height, width).
        temporal_size (`int`):
            The size of the temporal dimension.
        theta (`float`):
            Base of the frequency computation.
        use_real (`bool`):
            If True, return the cosine and sine parts separately. Otherwise, return complex numbers.

    Returns:
        A `(cos, sin)` tuple when `use_real` is True, otherwise one complex tensor. Every tensor has
        `temporal_size * grid_size[0] * grid_size[1]` rows and one column per concatenated axis dimension.
    """
    start, stop = crops_coords
    positions_h = np.linspace(start[0], stop[0], grid_size[0], endpoint=False, dtype=np.float32)
    positions_w = np.linspace(start[1], stop[1], grid_size[1], endpoint=False, dtype=np.float32)
    positions_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32)

    # Share of the embedding dimension given to each axis.
    dim_t = embed_dim // 4
    dim_h = embed_dim // 8 * 3
    dim_w = embed_dim // 8 * 3

    def _axis_angles(positions, axis_dim):
        # Outer product of positions and inverse frequencies, with every angle duplicated for its pair.
        inv_freq = 1.0 / (theta ** (torch.arange(0, axis_dim, 2).float() / axis_dim))
        pos = torch.from_numpy(positions).float()
        angles = torch.einsum("n , f -> n f", pos, inv_freq)
        return angles.repeat_interleave(2, dim=-1)

    angles_t = _axis_angles(positions_t, dim_t)
    angles_h = _axis_angles(positions_h, dim_h)
    angles_w = _axis_angles(positions_w, dim_w)

    def _broadcast_cat(parts, dim=-1):
        # Expand every tensor to the common shape on all axes but `dim`, then concatenate along `dim`.
        ranks = {len(p.shape) for p in parts}
        assert len(ranks) == 1, "tensors must all have the same number of dimensions"
        rank = list(ranks)[0]
        cat_dim = (dim + rank) if dim < 0 else dim
        sizes_per_axis = list(zip(*(list(p.shape) for p in parts)))
        assert all(
            [len(set(sizes)) <= 2 for axis, sizes in enumerate(sizes_per_axis) if axis != cat_dim]
        ), "invalid dimensions for broadcastable concatenation"
        expanded = []
        for idx, part in enumerate(parts):
            target_shape = [
                sizes_per_axis[axis][idx] if axis == cat_dim else max(sizes_per_axis[axis]) for axis in range(rank)
            ]
            expanded.append(part.expand(*target_shape))
        return torch.cat(expanded, dim=cat_dim)

    freqs = _broadcast_cat(
        (angles_t[:, None, None, :], angles_h[None, :, None, :], angles_w[None, None, :, :]), dim=-1
    )

    n_t, n_h, n_w, n_d = freqs.shape
    freqs = freqs.view(n_t * n_h * n_w, n_d)

    sin = freqs.sin()
    cos = freqs.cos()

    if not use_real:
        return torch.polar(torch.ones_like(freqs), freqs)
    return cos, sin
128
+
129
+
130
def get_2d_rotary_pos_embed(embed_dim, crops_coords, grid_size, use_real=True):
    """
    RoPE for image tokens with 2d structure.

    Args:
        embed_dim: (`int`):
            The embedding dimension size
        crops_coords (`Tuple[int]`)
            The top-left and bottom-right coordinates of the crop.
        grid_size (`Tuple[int]`):
            The grid size of the positional embedding, indexed as `(grid_size[0], grid_size[1])`.
        use_real (`bool`):
            If True, return real part and imaginary part separately. Otherwise, return complex numbers.

    Returns:
        Whatever `get_2d_rotary_pos_embed_from_grid` returns for the coordinate grid built here: a `(cos, sin)`
        tuple when `use_real` is True, otherwise a single tensor.
    """
    start, stop = crops_coords
    coords_h = np.linspace(start[0], stop[0], grid_size[0], endpoint=False, dtype=np.float32)
    coords_w = np.linspace(start[1], stop[1], grid_size[1], endpoint=False, dtype=np.float32)
    # The w coordinates go first. With numpy's default "xy" indexing each mesh array is shaped
    # (len(coords_h), len(coords_w)), so the stacked grid is [2, H, W] and channel 0 holds the w coordinates.
    mesh = np.stack(np.meshgrid(coords_w, coords_h), axis=0)
    mesh = mesh.reshape([2, 1, *mesh.shape[1:]])
    return get_2d_rotary_pos_embed_from_grid(embed_dim, mesh, use_real=use_real)
156
+
157
+
158
def get_2d_rotary_pos_embed_from_grid(embed_dim, grid, use_real=False):
    """
    Build a 2D rotary embedding from a two-channel coordinate grid.

    Each of `grid[0]` and `grid[1]` is flattened and encoded with `get_1d_rotary_pos_embed` using half of
    `embed_dim`; the two encodings are concatenated along dim 1.

    Args:
        embed_dim (`int`): Total embedding dimension; must be divisible by 4.
        grid: Coordinate grid whose first axis has (at least) two channels.
        use_real (`bool`): If True, return a `(cos, sin)` tuple; otherwise return one concatenated tensor.
    """
    assert embed_dim % 4 == 0

    half_dim = embed_dim // 2
    # One half of the dimensions encodes grid[0], the other half grid[1].
    first_axis = get_1d_rotary_pos_embed(half_dim, grid[0].reshape(-1), use_real=use_real)
    second_axis = get_1d_rotary_pos_embed(half_dim, grid[1].reshape(-1), use_real=use_real)

    if not use_real:
        return torch.cat([first_axis, second_axis], dim=1)

    cos = torch.cat([first_axis[0], second_axis[0]], dim=1)
    sin = torch.cat([first_axis[1], second_axis[1]], dim=1)
    return cos, sin
176
+
177
+
178
def get_2d_rotary_pos_embed_lumina(embed_dim, len_h, len_w, linear_factor=1.0, ntk_factor=1.0):
    """
    Build a `(len_h, len_w, embed_dim // 2)` rotary embedding grid from per-axis 1D embeddings.

    Args:
        embed_dim (`int`): Total embedding dimension; must be divisible by 4.
        len_h (`int`): Number of positions along the height axis.
        len_w (`int`): Number of positions along the width axis.
        linear_factor (`float`): Forwarded to `get_1d_rotary_pos_embed`.
        ntk_factor (`float`): Forwarded to `get_1d_rotary_pos_embed`.

    Returns:
        `torch.Tensor`: For every grid cell, the height and width embeddings interleaved along the last axis.
    """
    assert embed_dim % 4 == 0

    quarter_dim = embed_dim // 4
    scaling = {"linear_factor": linear_factor, "ntk_factor": ntk_factor}
    rows = get_1d_rotary_pos_embed(embed_dim // 2, len_h, **scaling)  # (H, D/4)
    cols = get_1d_rotary_pos_embed(embed_dim // 2, len_w, **scaling)  # (W, D/4)

    # Tile each axis embedding over the other axis so both become (H, W, D/4, 1).
    rows = rows.view(len_h, 1, quarter_dim, 1).repeat(1, len_w, 1, 1)
    cols = cols.view(1, len_w, quarter_dim, 1).repeat(len_h, 1, 1, 1)

    return torch.cat([rows, cols], dim=-1).flatten(2)  # (H, W, D/2)
192
+
193
+
194
def get_1d_rotary_pos_embed(
    dim: int,
    pos: Union[np.ndarray, int],
    theta: float = 10000.0,
    use_real=False,
    linear_factor=1.0,
    ntk_factor=1.0,
    repeat_interleave_real=True,
    freqs_dtype=torch.float32,  #  torch.float32 (hunyuan, stable audio), torch.float64 (flux)
):
    """
    Precompute the rotary frequency tensor for the given positions.

    The angle of position `p` at frequency index `i` is `p / (theta * ntk_factor) ** (2i / dim) / linear_factor`.
    Depending on `use_real`, the angles are returned as complex exponentials (cis) or as separate cosine and sine
    tensors.

    Args:
        dim (`int`): Dimension of the frequency tensor; must be even.
        pos (`np.ndarray` or `int`): Position indices for the frequency tensor. [S] or scalar. An `int` is expanded
            to `np.arange(pos)`.
        theta (`float`, *optional*, defaults to 10000.0):
            Base of the frequency computation. Defaults to 10000.0.
        use_real (`bool`, *optional*):
            If True, return real part and imaginary part separately. Otherwise, return complex numbers.
        linear_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for the context extrapolation. Defaults to 1.0.
        ntk_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for the NTK-Aware RoPE. Defaults to 1.0.
        repeat_interleave_real (`bool`, *optional*, defaults to `True`):
            If `True` and `use_real`, real part and imaginary part are each interleaved with themselves to reach `dim`.
            Otherwise, they are concatenated with themselves.
        freqs_dtype (`torch.float32` or `torch.float64`, *optional*, defaults to `torch.float32`):
            the dtype of the frequency tensor.
    Returns:
        `Tuple[torch.Tensor, torch.Tensor]` of float32 `(cos, sin)`, each [S, D], when `use_real` is True;
        otherwise a complex64 `torch.Tensor` of complex exponentials. [S, D/2]
    """
    assert dim % 2 == 0

    if isinstance(pos, int):
        pos = np.arange(pos)
    theta = theta * ntk_factor
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=freqs_dtype)[: (dim // 2)] / dim)) / linear_factor  # [D/2]
    t = torch.from_numpy(pos).to(freqs.device)  # type: ignore  # [S]
    freqs = torch.outer(t, freqs)  # type: ignore   # [S, D/2]
    if use_real and repeat_interleave_real:
        freqs_cos = freqs.cos().repeat_interleave(2, dim=1).float()  # [S, D]
        freqs_sin = freqs.sin().repeat_interleave(2, dim=1).float()  # [S, D]
        return freqs_cos, freqs_sin
    elif use_real:
        freqs_cos = torch.cat([freqs.cos(), freqs.cos()], dim=-1).float()  # [S, D]
        freqs_sin = torch.cat([freqs.sin(), freqs.sin()], dim=-1).float()  # [S, D]
        return freqs_cos, freqs_sin
    else:
        # Cast the angles to float32 *before* building the complex tensor. Calling `.float()` on the complex
        # result instead would silently drop the imaginary part and return only the cosines.
        angles = freqs.float()
        freqs_cis = torch.polar(torch.ones_like(angles), angles)  # complex64     # [S, D/2]
        return freqs_cis
249
+
250
+
251
class FluxPosEmbed(nn.Module):
    """
    Multi-axis rotary position embedding.

    Each column of the position ids is encoded with `get_1d_rotary_pos_embed` using the matching entry of
    `axes_dim`; the per-axis cosine and sine tensors are concatenated along the last dimension.

    Args:
        theta (`int`): Stored on the module.
        axes_dim (`List[int]`): Embedding dimension used for each axis of the position ids.
    """

    # modified from https://github.com/black-forest-labs/flux/blob/c00d7c60b085fce8058b9df845e036090873f2ce/src/flux/modules/layers.py#L11
    def __init__(self, theta: int, axes_dim: List[int]):
        super().__init__()
        # NOTE(review): `theta` is stored but never passed to `get_1d_rotary_pos_embed`, which therefore uses its
        # own default. Confirm whether that is intended before wiring it through, since it would change the
        # embeddings for any instance built with a different theta.
        self.theta = theta
        self.axes_dim = axes_dim

    def forward(self, ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Compute the rotary cosine and sine tables for `ids`.

        Args:
            ids (`torch.Tensor`): Position ids whose last dimension indexes the axes, e.g. `(seq, n_axes)` or
                `(1, seq, n_axes)`.

        Returns:
            `Tuple[torch.Tensor, torch.Tensor]`: `(freqs_cos, freqs_sin)` on the device of `ids`.
        """
        n_axes = ids.shape[-1]
        cos_out = []
        sin_out = []
        pos = ids.squeeze()
        if pos.ndim < 2:
            # `squeeze()` also removed a length-1 sequence (or axis) dimension; restore the (seq, n_axes) layout
            # so the per-axis column indexing below keeps working.
            pos = ids.reshape(-1, n_axes)
        pos = pos.float().cpu().numpy()
        is_mps = ids.device.type == "mps"
        # float64 is requested everywhere except on MPS devices.
        freqs_dtype = torch.float32 if is_mps else torch.float64
        for i in range(n_axes):
            cos, sin = get_1d_rotary_pos_embed(
                self.axes_dim[i], pos[:, i], repeat_interleave_real=True, use_real=True, freqs_dtype=freqs_dtype
            )
            cos_out.append(cos)
            sin_out.append(sin)
        freqs_cos = torch.cat(cos_out, dim=-1).to(ids.device)
        freqs_sin = torch.cat(sin_out, dim=-1).to(ids.device)
        return freqs_cos, freqs_sin
274
+
275
+
276
+
277
class FusedFluxAttnProcessor2_0:
    """Attention processor that reads q/k/v from fused projection layers.

    It calls ``attn.to_qkv`` (and ``attn.to_added_qkv`` when a context
    sequence is supplied), so the attention module must already expose
    those fused projections.
    """

    def __init__(self):
        if hasattr(F, "scaled_dot_product_attention"):
            return
        raise ImportError(
            "FusedFluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
        )

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        image_rotary_emb: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
        """Run joint attention over the (optional) context and sample tokens.

        `attention_mask` is accepted for interface compatibility but is not
        used by this processor.

        Returns:
            `hidden_states` alone when no `encoder_hidden_states` is given,
            otherwise the pair `(hidden_states, encoder_hidden_states)`.
        """
        has_context = encoder_hidden_states is not None
        reference = encoder_hidden_states if has_context else hidden_states
        batch_size, _, _ = reference.shape

        # `sample` projections: one fused matmul, then an even three-way split.
        fused = attn.to_qkv(hidden_states)
        query, key, value = torch.split(fused, fused.shape[-1] // 3, dim=-1)

        head_dim = key.shape[-1] // attn.heads

        def split_heads(tensor):
            # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
            return tensor.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        query = split_heads(query)
        key = split_heads(key)
        value = split_heads(value)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # `context` projections; skipped when the caller passes no
        # `encoder_hidden_states`.
        if has_context:
            fused_context = attn.to_added_qkv(encoder_hidden_states)
            context_query, context_key, context_value = torch.split(
                fused_context, fused_context.shape[-1] // 3, dim=-1
            )

            context_query = split_heads(context_query)
            context_key = split_heads(context_key)
            context_value = split_heads(context_value)

            if attn.norm_added_q is not None:
                context_query = attn.norm_added_q(context_query)
            if attn.norm_added_k is not None:
                context_key = attn.norm_added_k(context_key)

            # Context tokens are placed in front of the sample tokens.
            query = torch.cat([context_query, query], dim=2)
            key = torch.cat([context_key, key], dim=2)
            value = torch.cat([context_value, value], dim=2)

        if image_rotary_emb is not None:
            from .embeddings import apply_rotary_emb

            query = apply_rotary_emb(query, image_rotary_emb)
            key = apply_rotary_emb(key, image_rotary_emb)

        attended = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False)
        attended = attended.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        attended = attended.to(query.dtype)

        if not has_context:
            return attended

        # Undo the concatenation: leading tokens belong to the context.
        context_len = encoder_hidden_states.shape[1]
        context_out = attended[:, :context_len]
        sample_out = attended[:, context_len:]

        # linear proj
        sample_out = attn.to_out[0](sample_out)
        # dropout
        sample_out = attn.to_out[1](sample_out)
        context_out = attn.to_add_out(context_out)

        return sample_out, context_out
369
+
370
+
371
+
372
@maybe_allow_in_graph
class SingleTransformerBlock(nn.Module):
    r"""
    Single-stream transformer block: attention and an MLP branch are computed
    from the same normalized input, concatenated, projected back to `dim`,
    gated and added to the residual.

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        mlp_ratio (`float`, defaults to 4.0): Width of the MLP branch as a multiple of `dim`.
    """

    def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
        super().__init__()
        mlp_width = int(dim * mlp_ratio)
        self.mlp_hidden_dim = mlp_width

        self.norm = AdaLayerNormZeroSingle(dim)
        self.proj_mlp = nn.Linear(dim, mlp_width)
        self.act_mlp = nn.GELU(approximate="tanh")
        # Consumes the concatenation [attention output, MLP activations].
        self.proj_out = nn.Linear(dim + mlp_width, dim)

        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            bias=True,
            processor=FluxAttnProcessor2_0(),
            qk_norm="rms_norm",
            eps=1e-6,
            pre_only=True,
        )

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        image_rotary_emb=None,
    ):
        """Apply the block to `hidden_states`, conditioned on `temb`."""
        skip = hidden_states
        normed, gate = self.norm(hidden_states, emb=temb)

        mlp_branch = self.act_mlp(self.proj_mlp(normed))
        attn_branch = self.attn(
            hidden_states=normed,
            image_rotary_emb=image_rotary_emb,
        )

        merged = torch.cat([attn_branch, mlp_branch], dim=2)
        out = skip + gate.unsqueeze(1) * self.proj_out(merged)

        # Keep half-precision activations inside the finite fp16 range.
        if out.dtype == torch.float16:
            out = out.clip(-65504, 65504)

        return out
433
+
434
@maybe_allow_in_graph
class TransformerBlock(nn.Module):
    r"""
    Dual-stream transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.

    The sample stream and the context stream each have their own adaptive
    norm and feed-forward, and share one joint attention call.

    Reference: https://arxiv.org/abs/2403.03206

    Parameters:
        dim (`int`): The number of channels in the input and output.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
        qk_norm (`str`, defaults to `"rms_norm"`): Forwarded to `Attention` as `qk_norm`.
        eps (`float`, defaults to 1e-6): Forwarded to `Attention` as `eps`.
    """

    def __init__(self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6):
        super().__init__()

        self.norm1 = AdaLayerNormZero(dim)
        self.norm1_context = AdaLayerNormZero(dim)

        if not hasattr(F, "scaled_dot_product_attention"):
            raise ValueError(
                "The current PyTorch version does not support the `scaled_dot_product_attention` function."
            )

        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,
            added_kv_proj_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            context_pre_only=False,
            bias=True,
            processor=FluxAttnProcessor2_0(),
            qk_norm=qk_norm,
            eps=eps,
        )

        # Sample-stream feed-forward (a "swiglu" activation was tried here
        # previously; "gelu-approximate" is what is active).
        self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        # Context-stream feed-forward, same configuration.
        self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
        self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        image_rotary_emb=None,
    ):
        """Update both streams.

        Returns:
            `(encoder_hidden_states, hidden_states)` -- note the context
            stream comes first.
        """
        normed, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
        normed_ctx, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
            encoder_hidden_states, emb=temb
        )

        # Joint attention over both streams.
        attn_out, ctx_attn_out = self.attn(
            hidden_states=normed,
            encoder_hidden_states=normed_ctx,
            image_rotary_emb=image_rotary_emb,
        )

        # Sample stream: gated attention residual, then modulated feed-forward.
        hidden_states = hidden_states + gate_msa[:, None] * attn_out

        modulated = self.norm2(hidden_states)
        modulated = modulated * (1 + scale_mlp.unsqueeze(1)) + shift_mlp.unsqueeze(1)
        hidden_states = hidden_states + gate_mlp[:, None] * self.ff(modulated)

        # Context stream: same recipe with its own parameters.
        encoder_hidden_states = encoder_hidden_states + c_gate_msa[:, None] * ctx_attn_out

        modulated_ctx = self.norm2_context(encoder_hidden_states)
        modulated_ctx = modulated_ctx * (1 + c_scale_mlp.unsqueeze(1)) + c_shift_mlp.unsqueeze(1)
        encoder_hidden_states = encoder_hidden_states + c_gate_mlp[:, None] * self.ff_context(modulated_ctx)

        # Only the context stream is clipped to the finite fp16 range here.
        if encoder_hidden_states.dtype == torch.float16:
            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)

        return encoder_hidden_states, hidden_states
533
+
534
+
535
class UVit2DConvEmbed(nn.Module):
    """Embed a grid of token ids and project it with a 1x1 convolution."""

    def __init__(self, in_channels, block_out_channels, vocab_size, elementwise_affine, eps, bias):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, in_channels)
        self.layer_norm = RMSNorm(in_channels, eps, elementwise_affine)
        self.conv = nn.Conv2d(in_channels, block_out_channels, kernel_size=1, bias=bias)

    def forward(self, input_ids):
        """Return channels-first feature maps for `input_ids`."""
        tokens = self.layer_norm(self.embeddings(input_ids))
        # The embedding dimension is last; move it to the channel axis for Conv2d.
        channels_first = tokens.permute(0, 3, 1, 2)
        return self.conv(channels_first)
548
+
549
class ConvMlmLayer(nn.Module):
    """Prediction head: 1x1 conv, RMS norm over channels, 1x1 conv to codebook logits."""

    def __init__(
        self,
        block_out_channels: int,
        in_channels: int,
        use_bias: bool,
        ln_elementwise_affine: bool,
        layer_norm_eps: float,
        codebook_size: int,
    ):
        super().__init__()
        self.conv1 = nn.Conv2d(block_out_channels, in_channels, kernel_size=1, bias=use_bias)
        self.layer_norm = RMSNorm(in_channels, layer_norm_eps, ln_elementwise_affine)
        self.conv2 = nn.Conv2d(in_channels, codebook_size, kernel_size=1, bias=use_bias)

    def forward(self, hidden_states):
        """Map channels-first features to `codebook_size` logits per position."""
        features = self.conv1(hidden_states)
        # The norm acts on the last axis, so go channels-last and back.
        channels_last = features.permute(0, 2, 3, 1)
        normed = self.layer_norm(channels_last)
        features = normed.permute(0, 3, 1, 2)
        return self.conv2(features)
569
+
570
class SwiGLU(nn.Module):
    r"""
    Gated linear unit with a SiLU / Swish gate (see https://arxiv.org/abs/2002.05202);
    the same idea as `GEGLU` with SiLU in place of GeLU.

    Parameters:
        dim_in (`int`): The number of channels in the input.
        dim_out (`int`): The number of channels in the output.
        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
    """

    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
        super().__init__()
        # One projection produces both the value half and the gate half.
        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
        self.activation = nn.SiLU()

    def forward(self, hidden_states):
        """Return `value * SiLU(gate)` for the projected input."""
        projected = self.proj(hidden_states)
        value, gate = torch.chunk(projected, 2, dim=-1)
        return value * self.activation(gate)
590
+
591
class ConvNextBlock(nn.Module):
    """Depthwise-conv residual block followed by a conditioning scale/shift."""

    def __init__(
        self, channels, layer_norm_eps, ln_elementwise_affine, use_bias, hidden_dropout, hidden_size, res_ffn_factor=4
    ):
        super().__init__()
        ffn_channels = int(channels * res_ffn_factor)

        self.depthwise = nn.Conv2d(
            channels,
            channels,
            kernel_size=3,
            padding=1,
            groups=channels,
            bias=use_bias,
        )
        self.norm = RMSNorm(channels, layer_norm_eps, ln_elementwise_affine)
        self.channelwise_linear_1 = nn.Linear(channels, ffn_channels, bias=use_bias)
        self.channelwise_act = nn.GELU()
        self.channelwise_norm = GlobalResponseNorm(ffn_channels)
        self.channelwise_linear_2 = nn.Linear(ffn_channels, channels, bias=use_bias)
        self.channelwise_dropout = nn.Dropout(hidden_dropout)
        # Produces a per-channel (scale, shift) pair from the conditioning vector.
        self.cond_embeds_mapper = nn.Linear(hidden_size, channels * 2, use_bias)

    def forward(self, x, cond_embeds):
        """Apply the residual block to `x` and modulate it with `cond_embeds`."""
        shortcut = x

        # Channel-wise layers act on the last axis, so work channels-last.
        h = self.depthwise(x).permute(0, 2, 3, 1)
        h = self.norm(h)
        h = self.channelwise_linear_1(h)
        h = self.channelwise_act(h)
        h = self.channelwise_norm(h)
        h = self.channelwise_linear_2(h)
        h = self.channelwise_dropout(h)

        h = h.permute(0, 3, 1, 2) + shortcut

        modulation = self.cond_embeds_mapper(F.silu(cond_embeds))
        scale, shift = modulation.chunk(2, dim=1)
        return h * (1 + scale[:, :, None, None]) + shift[:, :, None, None]
634
+
635
class Simple_UVitBlock(nn.Module):
    """Optional resampling stage: a downsample and/or an upsample layer, nothing else."""

    def __init__(
        self,
        channels,
        ln_elementwise_affine,
        layer_norm_eps,
        use_bias,
        downsample: bool,
        upsample: bool,
    ):
        super().__init__()

        # Each stage is either a configured resampling layer or None (skipped).
        self.downsample = (
            Downsample2D(
                channels,
                use_conv=True,
                padding=0,
                name="Conv2d_0",
                kernel_size=2,
                norm_type="rms_norm",
                eps=layer_norm_eps,
                elementwise_affine=ln_elementwise_affine,
                bias=use_bias,
            )
            if downsample
            else None
        )

        self.upsample = (
            Upsample2D(
                channels,
                use_conv_transpose=True,
                kernel_size=2,
                padding=0,
                name="conv",
                norm_type="rms_norm",
                eps=layer_norm_eps,
                elementwise_affine=ln_elementwise_affine,
                bias=use_bias,
                interpolate=False,
            )
            if upsample
            else None
        )

    def forward(self, x):
        """Apply the configured stages in order (downsample first); identity if none."""
        for stage in (self.downsample, self.upsample):
            if stage is not None:
                x = stage(x)
        return x
689
+
690
+
691
class UVitBlock(nn.Module):
    """Resampling stage wrapped around paired ConvNext and attention blocks.

    Order in `forward`: optional downsample, then `num_res_blocks` pairs of
    (ConvNextBlock, SkipFFTransformerBlock), then optional upsample.
    """

    def __init__(
        self,
        channels,
        num_res_blocks: int,
        hidden_size,
        hidden_dropout,
        ln_elementwise_affine,
        layer_norm_eps,
        use_bias,
        block_num_heads,
        attention_dropout,
        downsample: bool,
        upsample: bool,
    ):
        super().__init__()

        self.downsample = (
            Downsample2D(
                channels,
                use_conv=True,
                padding=0,
                name="Conv2d_0",
                kernel_size=2,
                norm_type="rms_norm",
                eps=layer_norm_eps,
                elementwise_affine=ln_elementwise_affine,
                bias=use_bias,
            )
            if downsample
            else None
        )

        res_blocks = []
        for _ in range(num_res_blocks):
            res_blocks.append(
                ConvNextBlock(
                    channels,
                    layer_norm_eps,
                    ln_elementwise_affine,
                    use_bias,
                    hidden_dropout,
                    hidden_size,
                )
            )
        self.res_blocks = nn.ModuleList(res_blocks)

        attention_blocks = []
        for _ in range(num_res_blocks):
            attention_blocks.append(
                SkipFFTransformerBlock(
                    channels,
                    block_num_heads,
                    channels // block_num_heads,
                    hidden_size,
                    use_bias,
                    attention_dropout,
                    channels,
                    attention_bias=use_bias,
                    attention_out_bias=use_bias,
                )
            )
        self.attention_blocks = nn.ModuleList(attention_blocks)

        self.upsample = (
            Upsample2D(
                channels,
                use_conv_transpose=True,
                kernel_size=2,
                padding=0,
                name="conv",
                norm_type="rms_norm",
                eps=layer_norm_eps,
                elementwise_affine=ln_elementwise_affine,
                bias=use_bias,
                interpolate=False,
            )
            if upsample
            else None
        )

    def forward(self, x, pooled_text_emb, encoder_hidden_states, cross_attention_kwargs):
        """Run the stage on channels-first `x`, conditioning on the text inputs."""
        if self.downsample is not None:
            x = self.downsample(x)

        for conv_block, attn_block in zip(self.res_blocks, self.attention_blocks):
            x = conv_block(x, pooled_text_emb)

            # Attention works on a token sequence: flatten the grid, attend,
            # then restore the (batch, channels, height, width) layout.
            n, c, h, w = x.shape
            tokens = x.view(n, c, h * w).permute(0, 2, 1)
            tokens = attn_block(
                tokens, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs
            )
            x = tokens.permute(0, 2, 1).view(n, c, h, w)

        if self.upsample is not None:
            x = self.upsample(x)

        return x
788
+
789
class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
    """
    Flux-style transformer that maps a grid of token ids to per-position codebook logits.

    Token ids are embedded (`UVit2DConvEmbed`), optionally downsampled, run through
    dual-stream `TransformerBlock`s followed by `SingleTransformerBlock`s, optionally
    upsampled, and finally projected to `codebook_size` logits (`ConvMlmLayer`).

    Reference for the underlying block design: https://blackforestlabs.ai/announcing-black-forest-labs/

    Parameters:
        patch_size (`int`, defaults to 1): Stored in the config; not otherwise used in this class.
        in_channels (`int`, defaults to 64): Only used to set `self.out_channels`.
        num_layers (`int`, defaults to 19): The number of dual-stream `TransformerBlock`s.
        num_single_layers (`int`, defaults to 38): The number of `SingleTransformerBlock`s.
        attention_head_dim (`int`, defaults to 128): The number of channels in each head.
        num_attention_heads (`int`, defaults to 24): The number of heads to use for multi-head attention.
        joint_attention_dim (`int`, defaults to 4096): Feature size of the incoming `encoder_hidden_states`.
        pooled_projection_dim (`int`, defaults to 768): Feature size of the incoming `pooled_projections`.
        guidance_embeds (`bool`, defaults to False): Selects the guidance-aware time/text embedding class.
        axes_dims_rope (`Tuple[int]`, defaults to `(16, 56, 56)`): Per-axis rotary embedding sizes.
        vocab_size (`int`, defaults to 8256): Size of the input token embedding table.
        codebook_size (`int`, defaults to 8192): Number of output logits per position.
        downsample (`bool`, defaults to False): Whether `down_block` downsamples the embedded grid.
        upsample (`bool`, defaults to False): Whether `up_block` upsamples before the prediction head.
    """

    # Gradient checkpointing is switched off (this used to be True) because of:
    # NotImplementedError: DDPOptimizer backend: Found a higher order op in the graph. This is not supported.
    # Please turn off DDP optimizer using torch._dynamo.config.optimize_ddp=False. Note that this can cause
    # performance degradation because there will be one bucket for the entire Dynamo graph.
    # Please refer to this issue - https://github.com/pytorch/pytorch/issues/104674.
    _supports_gradient_checkpointing = False
    _no_split_modules = ["TransformerBlock", "SingleTransformerBlock"]

    @register_to_config
    def __init__(
        self,
        patch_size: int = 1,
        in_channels: int = 64,
        num_layers: int = 19,
        num_single_layers: int = 38,
        attention_head_dim: int = 128,
        num_attention_heads: int = 24,
        joint_attention_dim: int = 4096,
        pooled_projection_dim: int = 768,
        guidance_embeds: bool = False,  # unused in our implementation
        axes_dims_rope: Tuple[int] = (16, 56, 56),
        vocab_size: int = 8256,
        codebook_size: int = 8192,
        downsample: bool = False,
        upsample: bool = False,
    ):
        super().__init__()
        self.out_channels = in_channels
        self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim

        self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
        text_time_guidance_cls = (
            CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
        )
        self.time_text_embed = text_time_guidance_cls(
            embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
        )

        self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)

        self.transformer_blocks = nn.ModuleList(
            [
                TransformerBlock(
                    dim=self.inner_dim,
                    num_attention_heads=self.config.num_attention_heads,
                    attention_head_dim=self.config.attention_head_dim,
                )
                for _ in range(self.config.num_layers)
            ]
        )

        self.single_transformer_blocks = nn.ModuleList(
            [
                SingleTransformerBlock(
                    dim=self.inner_dim,
                    num_attention_heads=self.config.num_attention_heads,
                    attention_head_dim=self.config.attention_head_dim,
                )
                for _ in range(self.config.num_single_layers)
            ]
        )

        self.gradient_checkpointing = False

        # Fixed settings for the token embedding / prediction head stack.
        in_channels_embed = self.inner_dim
        ln_elementwise_affine = True
        layer_norm_eps = 1e-06
        use_bias = False
        # Width of the flattened micro-conditioning embedding built in `forward`
        # (256 channels per value there, so 1280 corresponds to five values).
        micro_cond_embed_dim = 1280
        self.embed = UVit2DConvEmbed(
            in_channels_embed, self.inner_dim, self.config.vocab_size, ln_elementwise_affine, layer_norm_eps, use_bias
        )
        self.mlm_layer = ConvMlmLayer(
            self.inner_dim, in_channels_embed, use_bias, ln_elementwise_affine, layer_norm_eps, self.config.codebook_size
        )
        self.cond_embed = TimestepEmbedding(
            micro_cond_embed_dim + self.config.pooled_projection_dim, self.inner_dim, sample_proj_bias=use_bias
        )
        self.encoder_proj_layer_norm = RMSNorm(self.inner_dim, layer_norm_eps, ln_elementwise_affine)
        self.project_to_hidden_norm = RMSNorm(in_channels_embed, layer_norm_eps, ln_elementwise_affine)
        self.project_to_hidden = nn.Linear(in_channels_embed, self.inner_dim, bias=use_bias)
        self.project_from_hidden_norm = RMSNorm(self.inner_dim, layer_norm_eps, ln_elementwise_affine)
        self.project_from_hidden = nn.Linear(self.inner_dim, in_channels_embed, bias=use_bias)

        self.down_block = Simple_UVitBlock(
            self.inner_dim,
            ln_elementwise_affine,
            layer_norm_eps,
            use_bias,
            downsample,
            False,
        )
        self.up_block = Simple_UVitBlock(
            self.inner_dim,
            ln_elementwise_affine,
            layer_norm_eps,
            use_bias,
            False,
            upsample=upsample,
        )

    @property
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
        r"""
        Returns:
            `dict` of attention processors: A dictionary containing all attention processors used in the model,
            indexed by weight name.
        """
        processors = {}

        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
            if hasattr(module, "get_processor"):
                processors[f"{name}.processor"] = module.get_processor()

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)

            return processors

        for name, module in self.named_children():
            fn_recursive_add_processors(name, module, processors)

        return processors

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
        r"""
        Sets the attention processor to use to compute attention.

        Parameters:
            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
                The instantiated processor class or a dictionary of processor classes that will be set as the processor
                for **all** `Attention` layers.

                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
                processor. This is strongly recommended when setting trainable attention processors.

        Raises:
            ValueError: If a dict is passed whose size differs from the number of attention layers.
        """
        count = len(self.attn_processors.keys())

        if isinstance(processor, dict) and len(processor) != count:
            raise ValueError(
                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
            )

        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
                    module.set_processor(processor)
                else:
                    # Entries are consumed from the dict as they are assigned.
                    module.set_processor(processor.pop(f"{name}.processor"))

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)

        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
    def fuse_qkv_projections(self):
        """
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
        are fused. For cross-attention modules, key and value projection matrices are fused.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        Raises:
            ValueError: If any current processor's class name contains "Added".
        """
        self.original_attn_processors = None

        for _, attn_processor in self.attn_processors.items():
            if "Added" in str(attn_processor.__class__.__name__):
                raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")

        # Remember the current processors so `unfuse_qkv_projections` can restore them.
        self.original_attn_processors = self.attn_processors

        for module in self.modules():
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

        self.set_attn_processor(FusedFluxAttnProcessor2_0())

    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.

        A no-op when `fuse_qkv_projections` has not been called.

        <Tip warning={true}>

        This API is 🧪 experimental.

        </Tip>

        """
        # `original_attn_processors` only exists after `fuse_qkv_projections`;
        # use getattr so calling this first does not raise AttributeError.
        original_processors = getattr(self, "original_attn_processors", None)
        if original_processors is not None:
            self.set_attn_processor(original_processors)

    def _set_gradient_checkpointing(self, module, value=False):
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value

    @staticmethod
    def _checkpointed_block_call(block, *inputs):
        """Run `block(*inputs)` under `torch.utils.checkpoint.checkpoint`."""

        def custom_forward(*args):
            return block(*args)

        ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
        return torch.utils.checkpoint.checkpoint(custom_forward, *inputs, **ckpt_kwargs)

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor = None,
        pooled_projections: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        img_ids: torch.Tensor = None,
        txt_ids: torch.Tensor = None,
        guidance: torch.Tensor = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        controlnet_block_samples=None,
        controlnet_single_block_samples=None,
        return_dict: bool = True,
        micro_conds: torch.Tensor = None,
    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
        """
        Predict codebook logits for a grid of token ids.

        Although most arguments default to `None`, `encoder_hidden_states`, `pooled_projections`, `timestep`,
        `img_ids`, `txt_ids` and `micro_conds` are all dereferenced unconditionally and must be provided.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch size, height, width)`):
                Integer token ids; they are looked up in the embedding table of `self.embed`.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
                from the embeddings of input conditions.
            timestep (`torch.LongTensor`):
                Used to indicate denoising step. It is multiplied by 1000 before being embedded.
            img_ids (`torch.Tensor`): Position ids for the image tokens; 2d, a leading batch dimension is deprecated.
            txt_ids (`torch.Tensor`): Position ids for the text tokens; 2d, a leading batch dimension is deprecated.
            guidance (`torch.Tensor`, *optional*): Guidance value, multiplied by 1000 and passed to the time/text
                embedding when given.
            joint_attention_kwargs (`dict`, *optional*):
                Only the `"scale"` entry is read here; it is used as the LoRA scale when the PEFT backend is active.
            controlnet_block_samples (`list` of `torch.Tensor`, *optional*):
                Residuals added to the sample stream after the dual-stream blocks.
            controlnet_single_block_samples (`list` of `torch.Tensor`, *optional*):
                Residuals added to the sample tokens after the single-stream blocks.
            return_dict (`bool`, *optional*, defaults to `True`):
                When `False` the logits are wrapped in a 1-tuple; otherwise the logits tensor itself is returned.
            micro_conds (`torch.Tensor`):
                Micro-conditioning values; each value is embedded with 256 channels and the result is flattened
                per sample and concatenated to `pooled_projections`.

        Returns:
            The logits tensor produced by `self.mlm_layer`, or `(logits,)` when `return_dict` is `False`.
        """
        micro_cond_encode_dim = 256  # same as self.config.micro_cond_encode_dim = 256 from amused
        micro_cond_embeds = get_timestep_embedding(
            micro_conds.flatten(), micro_cond_encode_dim, flip_sin_to_cos=True, downscale_freq_shift=0
        )
        micro_cond_embeds = micro_cond_embeds.reshape((hidden_states.shape[0], -1))

        pooled_projections = torch.cat([pooled_projections, micro_cond_embeds], dim=1)
        pooled_projections = pooled_projections.to(dtype=self.dtype)
        pooled_projections = self.cond_embed(pooled_projections).to(encoder_hidden_states.dtype)

        hidden_states = self.embed(hidden_states)

        encoder_hidden_states = self.context_embedder(encoder_hidden_states)
        encoder_hidden_states = self.encoder_proj_layer_norm(encoder_hidden_states)
        hidden_states = self.down_block(hidden_states)

        # Flatten the feature map into a token sequence; the grid shape is
        # kept so it can be restored after the transformer stack.
        batch_size, channels, height, width = hidden_states.shape
        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels)
        hidden_states = self.project_to_hidden_norm(hidden_states)
        hidden_states = self.project_to_hidden(hidden_states)

        scale_was_passed = False
        if joint_attention_kwargs is not None:
            joint_attention_kwargs = joint_attention_kwargs.copy()
            # Record this before popping, otherwise the warning below can never fire.
            scale_was_passed = joint_attention_kwargs.get("scale", None) is not None
            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        elif scale_was_passed:
            logger.warning(
                "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
            )

        timestep = timestep.to(hidden_states.dtype) * 1000
        if guidance is not None:
            guidance = guidance.to(hidden_states.dtype) * 1000
        temb = (
            self.time_text_embed(timestep, pooled_projections)
            if guidance is None
            else self.time_text_embed(timestep, guidance, pooled_projections)
        )

        if txt_ids.ndim == 3:
            logger.warning(
                "Passing `txt_ids` 3d torch.Tensor is deprecated. "
                "Please remove the batch dimension and pass it as a 2d torch Tensor"
            )
            txt_ids = txt_ids[0]
        if img_ids.ndim == 3:
            logger.warning(
                "Passing `img_ids` 3d torch.Tensor is deprecated. "
                "Please remove the batch dimension and pass it as a 2d torch Tensor"
            )
            img_ids = img_ids[0]
        # Text positions come first, matching the [context, sample] token order.
        ids = torch.cat((txt_ids, img_ids), dim=0)

        image_rotary_emb = self.pos_embed(ids)

        use_checkpointing = self.training and self.gradient_checkpointing

        for index_block, block in enumerate(self.transformer_blocks):
            if use_checkpointing:
                encoder_hidden_states, hidden_states = self._checkpointed_block_call(
                    block,
                    hidden_states,
                    encoder_hidden_states,
                    temb,
                    image_rotary_emb,
                )
            else:
                encoder_hidden_states, hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                )

            # controlnet residual
            if controlnet_block_samples is not None:
                interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
                interval_control = int(np.ceil(interval_control))
                hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]

        # The single-stream blocks see one sequence: context tokens then sample tokens.
        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
        context_len = encoder_hidden_states.shape[1]

        for index_block, block in enumerate(self.single_transformer_blocks):
            if use_checkpointing:
                hidden_states = self._checkpointed_block_call(
                    block,
                    hidden_states,
                    temb,
                    image_rotary_emb,
                )
            else:
                hidden_states = block(
                    hidden_states=hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                )

            # controlnet residual (sample tokens only)
            if controlnet_single_block_samples is not None:
                interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
                interval_control = int(np.ceil(interval_control))
                hidden_states[:, context_len:, ...] = (
                    hidden_states[:, context_len:, ...]
                    + controlnet_single_block_samples[index_block // interval_control]
                )

        # Drop the context tokens; only sample tokens are decoded.
        hidden_states = hidden_states[:, context_len:, ...]

        hidden_states = self.project_from_hidden_norm(hidden_states)
        hidden_states = self.project_from_hidden(hidden_states)

        # Restore the (batch, channels, height, width) layout recorded above.
        hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)

        hidden_states = self.up_block(hidden_states)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        output = self.mlm_layer(hidden_states)
        if not return_dict:
            return (output,)

        # NOTE: the raw logits tensor is returned here rather than a
        # `Transformer2DModelOutput`; kept as-is for existing callers.
        return output
vqvae/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "VQModel",
3
+ "_diffusers_version": "0.30.2",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 256,
9
+ 512,
10
+ 768
11
+ ],
12
+ "down_block_types": [
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D",
16
+ "DownEncoderBlock2D",
17
+ "DownEncoderBlock2D"
18
+ ],
19
+ "in_channels": 3,
20
+ "latent_channels": 64,
21
+ "layers_per_block": 2,
22
+ "lookup_from_codebook": true,
23
+ "mid_block_add_attention": false,
24
+ "norm_num_groups": 32,
25
+ "norm_type": "group",
26
+ "num_vq_embeddings": 8192,
27
+ "out_channels": 3,
28
+ "sample_size": 32,
29
+ "scaling_factor": 0.18215,
30
+ "up_block_types": [
31
+ "UpDecoderBlock2D",
32
+ "UpDecoderBlock2D",
33
+ "UpDecoderBlock2D",
34
+ "UpDecoderBlock2D",
35
+ "UpDecoderBlock2D"
36
+ ],
37
+ "vq_embed_dim": null,
38
+ "force_upcast": true
39
+ }
vqvae/diffusion_pytorch_model.fp16.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62ac839c4caebd5221d3c69a26ae76c057a2bb5b34ac59acec2e48ce4b9ae0a8
3
+ size 292520582
vqvae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1241a5c88b635af4f8cfb268e388ccaa70f55a458a473d68943e5c28d7b7f762
3
+ size 585009980