Spaces:
Running
on
Zero
Running
on
Zero
File size: 5,705 Bytes
7aafe2f 03e077d 7aafe2f 03e077d 7aafe2f 03e077d 7aafe2f 8ab26aa 7aafe2f 03e077d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
from typing import Any, Dict, List, Optional, Tuple, Union

import numpy as np
import torch
from diffusers import FluxPipeline, FlowMatchEulerDiscreteScheduler
from PIL import Image
# Constants for shift calculation: calculate_timestep_shift() draws a straight
# line through the points (BASE_SEQ_LEN, BASE_SHIFT) and (MAX_SEQ_LEN, MAX_SHIFT).
BASE_SEQ_LEN = 256  # image sequence length at which the shift equals BASE_SHIFT
MAX_SEQ_LEN = 4096  # image sequence length at which the shift equals MAX_SHIFT
BASE_SHIFT = 0.5  # shift (mu) produced for BASE_SEQ_LEN
MAX_SHIFT = 1.2  # shift (mu) produced for MAX_SEQ_LEN
# Helper functions
def calculate_timestep_shift(
    image_seq_len: int,
    base_seq_len: Optional[int] = None,
    max_seq_len: Optional[int] = None,
    base_shift: Optional[float] = None,
    max_shift: Optional[float] = None,
) -> float:
    """Calculates the timestep shift (mu) based on the image sequence length.

    The shift is a linear function of ``image_seq_len`` that passes through
    ``(base_seq_len, base_shift)`` and ``(max_seq_len, max_shift)``. The result
    is not clamped, so lengths outside that range are extrapolated.

    Args:
        image_seq_len: Number of image tokens in the packed latents.
        base_seq_len: Lower anchor length; defaults to ``BASE_SEQ_LEN``.
        max_seq_len: Upper anchor length; defaults to ``MAX_SEQ_LEN``.
        base_shift: Shift at ``base_seq_len``; defaults to ``BASE_SHIFT``.
        max_shift: Shift at ``max_seq_len``; defaults to ``MAX_SHIFT``.

    Returns:
        The interpolated shift value ``mu``.

    Raises:
        ValueError: If ``max_seq_len`` equals ``base_seq_len`` (the slope
            would be undefined).
    """
    # Defaults are resolved at call time so the module-level constants stay
    # the single source of truth for callers that pass nothing.
    if base_seq_len is None:
        base_seq_len = BASE_SEQ_LEN
    if max_seq_len is None:
        max_seq_len = MAX_SEQ_LEN
    if base_shift is None:
        base_shift = BASE_SHIFT
    if max_shift is None:
        max_shift = MAX_SHIFT

    if max_seq_len == base_seq_len:
        raise ValueError("`max_seq_len` and `base_seq_len` must differ.")

    slope = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    intercept = base_shift - slope * base_seq_len
    return image_seq_len * slope + intercept
def prepare_timesteps(
    scheduler: FlowMatchEulerDiscreteScheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    mu: Optional[float] = None,
) -> Tuple[torch.Tensor, int]:
    """Prepares the timesteps for the diffusion process.

    Configures ``scheduler`` from exactly one of a custom ``timesteps``
    schedule, a custom ``sigmas`` schedule, or a plain step count, then reads
    the resulting schedule back from ``scheduler.timesteps``.

    Args:
        scheduler: Scheduler whose ``set_timesteps`` is called.
        num_inference_steps: Step count, used only when neither ``timesteps``
            nor ``sigmas`` is given.
        device: Device forwarded to ``set_timesteps``.
        timesteps: Optional custom timestep schedule.
        sigmas: Optional custom sigma schedule.
        mu: Timestep shift forwarded to ``set_timesteps`` in every mode.

    Returns:
        A ``(timesteps, num_inference_steps)`` tuple, where the count is the
        length of the schedule the scheduler actually produced.

    Raises:
        ValueError: If both ``timesteps`` and ``sigmas`` are passed.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed.")

    # `mu` is forwarded on every path: it used to be dropped whenever a custom
    # schedule was supplied, so a caller-computed shift never reached the
    # scheduler in those modes.
    if timesteps is not None:
        scheduler.set_timesteps(timesteps=timesteps, device=device, mu=mu)
    elif sigmas is not None:
        scheduler.set_timesteps(sigmas=sigmas, device=device, mu=mu)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, mu=mu)

    timesteps = scheduler.timesteps
    return timesteps, len(timesteps)
# FLUX pipeline class
class HighSpeedFluxPipeline(FluxPipeline):
    """
    Extends the FluxPipeline with `generate_images`, a compact few-step
    text-to-image entry point that runs the denoising loop and returns the
    decoded final image.
    """

    @torch.inference_mode()
    def generate_images(
        self,
        prompt: Union[str, List[str]] = None,
        prompt_2: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 4,
        timesteps: List[int] = None,
        num_images_per_prompt: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        max_sequence_length: int = 128,
    ):
        """Generates an image by running the full denoising loop.

        Args:
            prompt: Prompt text (or list of prompts); may be omitted when
                `prompt_embeds` is supplied.
            prompt_2: Optional second prompt forwarded to `encode_prompt`.
            height: Output height; defaults to
                `default_sample_size * vae_scale_factor`.
            width: Output width; same default as `height`.
            num_inference_steps: Number of denoising steps used to build the
                default sigma schedule.
            timesteps: Optional custom timestep schedule. When given, it is
                used instead of the default sigma schedule.
            num_images_per_prompt: Number of latents generated per prompt.
            generator: RNG(s) forwarded to `prepare_latents`.
            latents: Optional pre-made latents forwarded to `prepare_latents`.
            prompt_embeds: Optional precomputed prompt embeddings.
            pooled_prompt_embeds: Optional precomputed pooled embeddings.
            output_type: Output format passed to the image processor.
            return_dict: Accepted for signature compatibility; not used.
            max_sequence_length: Maximum prompt sequence length.

        Returns:
            The first decoded image of the batch, in `output_type` format.
        """
        height = height or self.default_sample_size * self.vae_scale_factor
        width = width or self.default_sample_size * self.vae_scale_factor

        # 1. Check inputs
        self.check_inputs(
            prompt,
            prompt_2,
            height,
            width,
            prompt_embeds=prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            max_sequence_length=max_sequence_length,
        )

        # 2. Define call parameters
        # `prompt` may be None when embeddings are passed directly; fall back
        # to the embedding batch dimension instead of calling len(None).
        if isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None:
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]
        device = self._execution_device

        # 3. Encode the prompt
        prompt_embeds, pooled_prompt_embeds, text_ids = self.encode_prompt(
            prompt=prompt,
            prompt_2=prompt_2,
            prompt_embeds=prompt_embeds,
            pooled_prompt_embeds=pooled_prompt_embeds,
            device=device,
            num_images_per_prompt=num_images_per_prompt,
            max_sequence_length=max_sequence_length,
        )

        # 4. Prepare latent variables
        num_channels_latents = self.transformer.config.in_channels // 4
        latents, latent_image_ids = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
            height,
            width,
            prompt_embeds.dtype,
            device,
            generator,
            latents,
        )

        # 5. Prepare timesteps
        # Only build the default sigma schedule when the caller did not supply
        # `timesteps`; passing both makes prepare_timesteps raise ValueError.
        sigmas = None
        if timesteps is None:
            sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
        image_seq_len = latents.shape[1]
        mu = calculate_timestep_shift(image_seq_len)
        timesteps, num_inference_steps = prepare_timesteps(
            self.scheduler,
            num_inference_steps,
            device,
            timesteps,
            sigmas,
            mu=mu,
        )
        self._num_timesteps = len(timesteps)

        # 6. Denoising loop
        for t in timesteps:
            timestep = t.expand(latents.shape[0]).to(latents.dtype)
            noise_pred = self.transformer(
                hidden_states=latents,
                timestep=timestep / 1000,
                pooled_projections=pooled_prompt_embeds,
                encoder_hidden_states=prompt_embeds,
                txt_ids=text_ids,
                img_ids=latent_image_ids,
                return_dict=False,
            )[0]
            latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
            # Release cached CUDA memory after every step.
            torch.cuda.empty_cache()

        # Final image
        image = self._decode_latents_to_image(latents, height, width, output_type)

        # Cleanup must run before returning; placed after `return` it was
        # unreachable and never executed.
        self.maybe_free_model_hooks()
        torch.cuda.empty_cache()
        return image

    def _decode_latents_to_image(self, latents, height, width, output_type, vae=None):
        """Decodes the given latents into an image.

        Unpacks the latents, undoes the VAE scaling and shift, decodes them,
        and post-processes the result. Uses `self.vae` unless another `vae` is
        supplied. Only the first image of the batch is returned.
        """
        if vae is None:
            vae = self.vae
        latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
        latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor
        image = vae.decode(latents, return_dict=False)[0]
        return self.image_processor.postprocess(image, output_type=output_type)[0]