# turbo_hc / inversion_run_adapter.py
# zhiweili
# only convert mask area to gray
# 012d7e8
# raw
# history blame
# 10.2 kB
import torch
from diffusers import (
DDPMScheduler,
DiffusionPipeline,
T2IAdapter,
MultiAdapter,
)
from controlnet_aux import (
LineartDetector,
CannyDetector,
MidasDetector,
PidiNetDetector,
)
from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img import retrieve_timesteps, retrieve_latents
from PIL import Image
from inversion_utils import get_ddpm_inversion_scheduler, create_xts
from config import get_config, get_num_steps_actual
from functools import partial
from compel import Compel, ReturnedEmbeddingsType
class Object:
    """Bare attribute container used as a stand-in options namespace."""


# Fixed option values for this script. They are consumed by `get_config(args)`
# and read directly as `args.*` further down in this module.
args = Object()
vars(args).update(
    {
        "images_paths": None,
        "images_folder": None,
        "force_use_cpu": False,
        "folder_name": 'test_measure_time',
        "config_from_file": 'run_configs/noise_shift_guidance_1_5.yaml',
        "save_intermediate_results": False,
        "batch_size": None,
        "skip_p_to_p": True,
        "only_p_to_p": False,
        "fp16": False,
        "prompts_file": 'dataset_measure_time/dataset.json',
        "images_in_prompts_file": None,
        "seed": 986,
        "time_measure_n": 1,
    }
)

# Saving intermediate results is only allowed while no batch size is set.
assert (
    args.batch_size is None or args.save_intermediate_results is False
), "save_intermediate_results is not implemented for batch_size > 1"
# Module-level generator slot; `encode_image` reads this name. It stays None
# here, while `run` builds its own seeded generator locally.
generator = None

# Use the GPU whenever torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint id used for both the pipeline weights and the scheduler config.
# Other checkpoint ids are kept below for reference.
# BASE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
BASE_MODEL = "stabilityai/sdxl-turbo"
# BASE_MODEL = "SG161222/RealVisXL_V5.0_Lightning"
# BASE_MODEL = "Lykon/dreamshaper-xl-v2-turbo"
# BASE_MODEL = "RunDiffusion/Juggernaut-XL-Lightning"
# Annotators that turn an input image into adapter conditioning maps.
# `run` calls only the lineart and canny detectors; the pidinet and midas
# detectors are still loaded here, but their call sites in `run` are
# commented out.
lineart_detector = LineartDetector.from_pretrained("lllyasviel/Annotators").to(device)
pidinet_detector = PidiNetDetector.from_pretrained("lllyasviel/Annotators").to(device)
# NOTE: the spelling "canndy" is kept because `run` refers to this exact name.
canndy_detector = CannyDetector()
midas_detector = MidasDetector.from_pretrained(
    "valhalla/t2iadapter-aux-models",
    filename="dpt_large_384.pt",
    model_type="dpt_large",
).to(device)
# T2I-Adapters combined into one MultiAdapter. The order here (lineart, then
# canny) must match the order of the conditioning images and scales that
# `run` passes as `adapter_image` / `adapter_conditioning_scale`.
#
# The keyword is `variant` (it was previously misspelled `varient`, so the
# fp16 weight files were never actually requested).
adapters = MultiAdapter(
    [
        T2IAdapter.from_pretrained(
            "TencentARC/t2i-adapter-lineart-sdxl-1.0",
            torch_dtype=torch.float16,
            variant="fp16",
        ),
        T2IAdapter.from_pretrained(
            "TencentARC/t2i-adapter-canny-sdxl-1.0",
            torch_dtype=torch.float16,
            variant="fp16",
        ),
        # T2IAdapter.from_pretrained(
        #     "TencentARC/t2i-adapter-sketch-sdxl-1.0",
        #     torch_dtype=torch.float16,
        #     variant="fp16",
        # ),
        # T2IAdapter.from_pretrained(
        #     "TencentARC/t2i-adapter-depth-midas-sdxl-1.0",
        #     torch_dtype=torch.float16,
        #     variant="fp16",
        # ),
    ]
)
# Cast the combined adapter module to half precision as well.
adapters = adapters.to(torch.float16)
# Build the pipeline from the base checkpoint using the local custom pipeline
# file, attach the adapters, and move everything to the selected device.
pipeline = DiffusionPipeline.from_pretrained(
    BASE_MODEL,
    custom_pipeline="./pipelines/pipeline_sdxl_adapter_img2img.py",
    adapter=adapters,
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
).to(device)

# Replace the pipeline's scheduler with a DDPM scheduler loaded from the
# base checkpoint's "scheduler" subfolder.
pipeline.scheduler = DDPMScheduler.from_pretrained(BASE_MODEL, subfolder="scheduler")
# Run configuration built from the hard-coded `args` above.
config = get_config(args)

# Prompt-embedding helper wired to both tokenizer/text-encoder pairs of the
# pipeline; a pooled output is requested from the second encoder only.
compel_proc = Compel(
    tokenizer=[pipeline.tokenizer, pipeline.tokenizer_2],
    text_encoder=[pipeline.text_encoder, pipeline.text_encoder_2],
    requires_pooled=[False, True],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
)
def run(
    input_image:Image,
    src_prompt:str,
    tgt_prompt:str,
    generate_size:int,
    seed:int,
    w1:float,
    w2:float,
    num_steps:int,
    start_step:int,
    guidance_scale:float,
    lineart_scale:float = 0.5,
    canny_scale:float = 0.5,
    lineart_detect:float = 0.375,
    canny_detect:float = 0.375,
):
    """Edit ``input_image`` from ``src_prompt`` towards ``tgt_prompt``.

    Side effects: mutates the module-level ``config`` and replaces both
    ``pipeline.__call__`` and ``pipeline.scheduler`` on the shared pipeline.

    Args:
        input_image: Image to invert and edit; also fed to the detectors.
        src_prompt: Prompt used for the first two batch entries.
        tgt_prompt: Prompt used for the third batch entry.
        generate_size: ``image_resolution`` passed to both detectors.
        seed: Seed for the ``torch.Generator`` handed to ``create_xts`` and
            to the pipeline call.
        w1: Value repeated for every step in ``config.ws1``.
        w2: Value repeated for every step in ``config.ws2``.
        num_steps: Stored as ``config.num_steps_inversion``.
        start_step: Initial ``config.step_start`` (re-derived after the
            timesteps are known).
        guidance_scale: Forwarded to the pipeline call.
        lineart_scale: Conditioning scale for the lineart adapter.
        canny_scale: Conditioning scale for the canny adapter.
        lineart_detect: Fraction of ``generate_size`` used as the lineart
            detector's ``detect_resolution``.
        canny_detect: Fraction of ``generate_size`` used as the canny
            detector's ``detect_resolution``.

    Returns:
        The pipeline output image at index 2, i.e. the batch slot whose
        prompt is ``tgt_prompt``.
    """
    rng = torch.Generator().manual_seed(seed)

    # Publish the requested schedule on the shared config before the step
    # counts are derived from it.
    config.num_steps_inversion = num_steps
    config.step_start = start_step
    num_steps_actual = get_num_steps_actual(config)
    num_steps_inversion = config.num_steps_inversion
    denoising_start = (num_steps_inversion - num_steps_actual) / num_steps_inversion
    print(f"-------->num_steps_inversion: {num_steps_inversion} num_steps_actual: {num_steps_actual} denoising_start: {denoising_start}")

    # First call sets up the scheduler for the full inversion length; only its
    # step count is reused, the timesteps come from the pipeline's own helper.
    _, step_count = retrieve_timesteps(
        pipeline.scheduler, num_steps_inversion, device, None
    )
    timesteps, step_count = pipeline.get_timesteps(
        num_inference_steps=step_count,
        denoising_start=denoising_start,
        strength=0,
        device=device,
    )
    # Re-wrap every timestep as its own int64-valued 0-d tensor.
    timesteps = [torch.tensor(step) for step in timesteps.type(torch.int64).tolist()]

    # Re-align the config with the number of timesteps actually returned.
    kept_steps = len(timesteps)
    config.step_start = start_step + num_steps_actual - kept_steps
    num_steps_actual = kept_steps
    config.max_norm_zs = [-1] * (num_steps_actual - 1) + [15.5]
    print(f"-------->num_steps_inversion: {num_steps_inversion} num_steps_actual: {num_steps_actual} step_start: {config.step_start}")
    print(f"-------->timesteps len: {len(timesteps)} max_norm_zs len: {len(config.max_norm_zs)}")

    # Conditioning maps; the order (lineart, canny) matches the adapter order.
    # The pidinet and midas (depth) detectors are currently not used here.
    lineart_image = lineart_detector(
        input_image,
        detect_resolution=int(generate_size * lineart_detect),
        image_resolution=generate_size,
    )
    canny_image = canndy_detector(
        input_image,
        detect_resolution=int(generate_size * canny_detect),
        image_resolution=generate_size,
    )

    # Bake the per-run arguments into the pipeline call. This overwrites the
    # `__call__` attribute on the shared pipeline object.
    pipeline.__call__ = partial(
        pipeline.__call__,
        num_inference_steps=num_steps_inversion,
        guidance_scale=guidance_scale,
        generator=rng,
        denoising_start=denoising_start,
        strength=0,
        adapter_image=[lineart_image, canny_image],
        adapter_conditioning_scale=[lineart_scale, canny_scale],
    )

    # Encode the input image and build the x_t sequence for the timesteps.
    x_0 = encode_image(input_image, pipeline)
    x_ts = [
        xt.to(dtype=torch.float16)
        for xt in create_xts(
            1, None, 0, rng, pipeline.scheduler, timesteps, x_0, no_add_noise=False
        )
    ]
    latents = [x_ts[0]]
    x_ts_c_hat = [None]

    config.ws1 = [w1] * num_steps_actual
    config.ws2 = [w2] * num_steps_actual

    # Swap in the inversion scheduler. The six empty lists fill the positional
    # slots formerly bound to v1s_images, v2s_images, deltas_images, v1_x0s,
    # v2_x0s and deltas_x0s; none of them is read afterwards in this function.
    pipeline.scheduler = get_ddpm_inversion_scheduler(
        pipeline.scheduler,
        config.step_function,
        config,
        timesteps,
        config.save_timesteps,
        latents,
        x_ts,
        x_ts_c_hat,
        args.save_intermediate_results,
        pipeline,
        x_0,
        [],
        [],
        [],
        [],
        [],
        [],
        "res12",
        image_name="im_name",
        time_measure_n=args.time_measure_n,
    )

    # Three batch entries share the same starting latent: source, source, target.
    start_latent = latents[0].expand(3, -1, -1, -1)
    prompt_embeds, pooled_embeds = compel_proc([src_prompt, src_prompt, tgt_prompt])
    images = pipeline.__call__(
        image=start_latent,
        prompt_embeds=prompt_embeds,
        pooled_prompt_embeds=pooled_embeds,
        eta=1,
    ).images
    return images[2]
def encode_image(image, pipe):
    """Encode ``image`` into scaled VAE latents, returned as float16.

    The image is preprocessed with ``pipe.image_processor``, moved to the
    module-level ``device`` and encoded by ``pipe.vae``. When the VAE config
    sets ``force_upcast``, the VAE runs in float32 for the encode and is
    put back to the pipeline dtype afterwards, even if encoding raises.

    Sampling uses the module-level ``generator``. If that is a list, each
    batch element is encoded with its own generator (one per element).

    Args:
        image: Input accepted by ``pipe.image_processor.preprocess``.
        pipe: Pipeline providing ``image_processor``, ``vae`` and ``dtype``.

    Returns:
        Latents multiplied by ``pipe.vae.config.scaling_factor``, as float16.
    """
    pixels = pipe.image_processor.preprocess(image)
    pipe_dtype = pipe.dtype
    pixels = pixels.to(device=device, dtype=pipe_dtype)

    upcast = pipe.vae.config.force_upcast
    if upcast:
        pixels = pixels.float()
        pipe.vae.to(dtype=torch.float32)

    try:
        if isinstance(generator, list):
            # One generator per batch element. The loop covers the whole batch
            # (it was previously hard-coded to the first element only).
            init_latents = torch.cat(
                [
                    retrieve_latents(
                        pipe.vae.encode(pixels[i : i + 1]), generator=generator[i]
                    )
                    for i in range(pixels.shape[0])
                ],
                dim=0,
            )
        else:
            init_latents = retrieve_latents(pipe.vae.encode(pixels), generator=generator)
    finally:
        # Always undo the temporary float32 upcast so a failed encode does not
        # leave the shared VAE in the wrong dtype.
        if upcast:
            pipe.vae.to(pipe_dtype)

    init_latents = init_latents.to(pipe_dtype)
    init_latents = pipe.vae.config.scaling_factor * init_latents
    return init_latents.to(dtype=torch.float16)
def get_timesteps(pipe, num_inference_steps, strength, device, denoising_start=None):
    """Select the tail of ``pipe.scheduler.timesteps`` to run.

    Without ``denoising_start`` the start index is derived from ``strength``.
    With ``denoising_start`` the value of ``strength`` is ignored and only
    timesteps below the corresponding discrete cutoff are kept.

    Args:
        pipe: Object exposing ``scheduler`` with ``timesteps``, ``order`` and
            ``config.num_train_timesteps``.
        num_inference_steps: Number of scheduler steps configured.
        strength: Fraction of the schedule to run when ``denoising_start``
            is None.
        device: Unused; kept for signature compatibility.
        denoising_start: Optional fraction of the schedule to skip.

    Returns:
        Tuple ``(timesteps, num_inference_steps)`` for the selected range.
    """
    scheduler = pipe.scheduler

    if denoising_start is None:
        # Strength-based selection: skip the first `t_start` steps.
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = scheduler.timesteps[t_start * scheduler.order :]
        return timesteps, num_inference_steps - t_start

    # Strength is irrelevant if we directly request a timestep to start at;
    # that is, strength is determined by the denoising_start instead.
    timesteps = scheduler.timesteps
    train_steps = scheduler.config.num_train_timesteps
    discrete_timestep_cutoff = int(round(train_steps - (denoising_start * train_steps)))
    num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
    if scheduler.order == 2 and num_inference_steps % 2 == 0:
        # For a 2nd order scheduler every timestep except the highest one is
        # duplicated. An even count would cut in the middle of a denoising
        # step (between 1st and 2nd derivative), so add 1 to always end after
        # the 2nd derivative step.
        num_inference_steps = num_inference_steps + 1

    # Because t_n+1 >= t_n, keep the tail of the schedule. Slice from an
    # explicit start index: `timesteps[-0:]` would return the whole schedule
    # when no step is below the cutoff. The clamp keeps the whole schedule
    # when the +1 adjustment above exceeds its length.
    first_kept = max(len(timesteps) - num_inference_steps, 0)
    timesteps = timesteps[first_kept:]
    return timesteps, num_inference_steps