Switti / models /pipeline.py
realantonvoronov
disable early cfg
fc4c596
raw
history blame
9.57 kB
import torch
from torchvision.transforms import ToPILImage
from PIL.Image import Image as PILImage
from models.vqvae import VQVAEHF
from models.clip import FrozenCLIPEmbedder
from models.switti import SwittiHF, get_crop_condition
from models.helpers import sample_with_top_k_top_p_, gumbel_softmax_with_rng
class SwittiPipeline:
vae_path = "yresearch/VQVAE-Switti"
text_encoder_path = "openai/clip-vit-large-patch14"
text_encoder_2_path = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
def __init__(self, switti, vae, text_encoder, text_encoder_2, device,
dtype=torch.bfloat16,
):
self.switti = switti.to(dtype)
self.vae = vae.to(dtype)
self.text_encoder = text_encoder.to(dtype)
self.text_encoder_2 = text_encoder_2.to(dtype)
self.switti.eval()
self.vae.eval()
self.device = device
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, device="cuda"):
switti = SwittiHF.from_pretrained(pretrained_model_name_or_path).to(device)
vae = VQVAEHF.from_pretrained(cls.vae_path).to(device)
text_encoder = FrozenCLIPEmbedder(cls.text_encoder_path, device=device)
text_encoder_2 = FrozenCLIPEmbedder(cls.text_encoder_2_path, device=device)
return cls(switti, vae, text_encoder, text_encoder_2, device)
@staticmethod
def to_image(tensor):
return [ToPILImage()(
(255 * img.cpu().detach()).to(torch.uint8))
for img in tensor]
def _encode_prompt(self, prompt: str | list[str]):
prompt = [prompt] if isinstance(prompt, str) else prompt
encodings = [
self.text_encoder.encode(prompt),
self.text_encoder_2.encode(prompt),
]
prompt_embeds = torch.concat(
[encoding.last_hidden_state for encoding in encodings], dim=-1
)
pooled_prompt_embeds = encodings[-1].pooler_output
attn_bias = encodings[-1].attn_bias
return prompt_embeds, pooled_prompt_embeds, attn_bias
def encode_prompt(
self,
prompt: str | list[str],
null_prompt: str = "",
encode_null: bool = True,
):
prompt_embeds, pooled_prompt_embeds, attn_bias = self._encode_prompt(prompt)
if encode_null:
B, L, hidden_dim = prompt_embeds.shape
pooled_dim = pooled_prompt_embeds.shape[1]
null_embeds, null_pooled_embeds, null_attn_bias = self._encode_prompt(null_prompt)
null_embeds = null_embeds[:, :L].expand(B, L, hidden_dim).to(prompt_embeds.device)
null_pooled_embeds = null_pooled_embeds.expand(B, pooled_dim).to(pooled_prompt_embeds.device)
null_attn_bias = null_attn_bias[:, :L].expand(B, L).to(attn_bias.device)
prompt_embeds = torch.cat([prompt_embeds, null_embeds], dim=0)
pooled_prompt_embeds = torch.cat([pooled_prompt_embeds, null_pooled_embeds], dim=0)
attn_bias = torch.cat([attn_bias, null_attn_bias], dim=0)
return prompt_embeds, pooled_prompt_embeds, attn_bias
@torch.inference_mode()
def __call__(
self,
prompt: str | list[str],
null_prompt: str = "",
seed: int | None = None,
cfg: float = 4.0,
top_k: int = 400,
top_p: float = 0.95,
more_smooth: bool = False,
return_pil: bool = True,
smooth_start_si: int = 0,
turn_off_cfg_start_si: int = 10,
turn_on_cfg_start_si: int = 0,
image_size: tuple[int, int] = (512, 512),
last_scale_temp: None | float = None,
) -> torch.Tensor | list[PILImage]:
"""
only used for inference, on autoregressive mode
:param prompt: text prompt to generate an image
:param null_prompt: negative prompt for CFG
:param seed: random seed
:param cfg: classifier-free guidance ratio
:param top_k: top-k sampling
:param top_p: top-p sampling
:param more_smooth: sampling using gumbel softmax; only used in visualization, not used in FID/IS benchmarking
:return: if return_pil: list of PIL Images, else: torch.tensor (B, 3, H, W) in [0, 1]
"""
assert not self.switti.training
switti = self.switti
vae = self.vae
vae_quant = self.vae.quantize
if seed is None:
rng = None
else:
switti.rng.manual_seed(seed)
rng = switti.rng
context, cond_vector, context_attn_bias = self.encode_prompt(prompt, null_prompt)
B = context.shape[0] // 2
cond_vector = switti.text_pooler(cond_vector)
if switti.use_crop_cond:
crop_coords = get_crop_condition(2 * B * [image_size[0]],
2 * B * [image_size[1]],
).to(cond_vector.device)
crop_embed = switti.crop_embed(crop_coords.view(-1)).reshape(2 * B, switti.D)
crop_cond = switti.crop_proj(crop_embed)
else:
crop_cond = None
sos = cond_BD = cond_vector
lvl_pos = switti.lvl_embed(switti.lvl_1L)
if not switti.rope:
lvl_pos += switti.pos_1LC
next_token_map = (
sos.unsqueeze(1)
+ switti.pos_start.expand(2 * B, switti.first_l, -1)
+ lvl_pos[:, : switti.first_l]
)
cur_L = 0
f_hat = sos.new_zeros(B, switti.Cvae, switti.patch_nums[-1], switti.patch_nums[-1])
for b in switti.blocks:
b.attn.kv_caching(switti.use_ar) # Use KV caching if switti is in the AR mode
b.cross_attn.kv_caching(True)
for si, pn in enumerate(switti.patch_nums): # si: i-th segment
ratio = si / switti.num_stages_minus_1
x_BLC = next_token_map
if switti.rope:
freqs_cis = switti.freqs_cis[:, cur_L : cur_L + pn * pn]
else:
freqs_cis = switti.freqs_cis
if si >= turn_off_cfg_start_si:
apply_smooth = False
x_BLC = x_BLC[:B]
context = context[:B]
context_attn_bias = context_attn_bias[:B]
freqs_cis = freqs_cis[:B]
cond_BD = cond_BD[:B]
if crop_cond is not None:
crop_cond = crop_cond[:B]
for b in switti.blocks:
if b.attn.caching and b.attn.cached_k is not None:
b.attn.cached_k = b.attn.cached_k[:B]
b.attn.cached_v = b.attn.cached_v[:B]
if b.cross_attn.caching and b.cross_attn.cached_k is not None:
b.cross_attn.cached_k = b.cross_attn.cached_k[:B]
b.cross_attn.cached_v = b.cross_attn.cached_v[:B]
else:
apply_smooth = more_smooth
for block in switti.blocks:
x_BLC = block(
x=x_BLC,
cond_BD=cond_BD,
attn_bias=None,
context=context,
context_attn_bias=context_attn_bias,
freqs_cis=freqs_cis,
crop_cond=crop_cond,
)
cur_L += pn * pn
logits_BlV = switti.get_logits(x_BLC, cond_BD)
# Guidance
if si < turn_on_cfg_start_si:
# t = 0, i. e. no guidance
logits_BlV = logits_BlV[:B]
elif si >= turn_on_cfg_start_si and si < turn_off_cfg_start_si:
# default const cfg
t = cfg
logits_BlV = (1 + t) * logits_BlV[:B] - t * logits_BlV[B:]
elif last_scale_temp is not None:
logits_BlV = logits_BlV / last_scale_temp
if apply_smooth and si >= smooth_start_si:
# not used when evaluating FID/IS/Precision/Recall
gum_t = max(0.27 * (1 - ratio * 0.95), 0.005) # refer to mask-git
idx_Bl = gumbel_softmax_with_rng(
logits_BlV.mul(1 + ratio), tau=gum_t, hard=False, dim=-1, rng=rng,
)
h_BChw = idx_Bl @ vae_quant.embedding.weight.unsqueeze(0)
else:
# defaul nucleus sampling
idx_Bl = sample_with_top_k_top_p_(
logits_BlV, rng=rng, top_k=top_k, top_p=top_p, num_samples=1,
)[:, :, 0]
h_BChw = vae_quant.embedding(idx_Bl)
h_BChw = h_BChw.transpose_(1, 2).reshape(B, switti.Cvae, pn, pn)
f_hat, next_token_map = vae_quant.get_next_autoregressive_input(
si, len(switti.patch_nums), f_hat, h_BChw,
)
if si != switti.num_stages_minus_1: # prepare for next stage
next_token_map = next_token_map.view(B, switti.Cvae, -1).transpose(1, 2)
next_token_map = (
switti.word_embed(next_token_map)
+ lvl_pos[:, cur_L : cur_L + switti.patch_nums[si + 1] ** 2]
)
# double the batch sizes due to CFG
next_token_map = next_token_map.repeat(2, 1, 1)
for b in switti.blocks:
b.attn.kv_caching(False)
b.cross_attn.kv_caching(False)
# de-normalize, from [-1, 1] to [0, 1]
img = vae.fhat_to_img(f_hat).add(1).mul(0.5)
if return_pil:
img = self.to_image(img)
return img