Spaces:

smartfeed
/

turbo_hc

Sleeping

File size: 10,603 Bytes

import torch

from diffusers import (
    DDPMScheduler,
    DiffusionPipeline,
    T2IAdapter,
    MultiAdapter,
    AutoencoderKL,
)
from controlnet_aux import (
    LineartDetector,
    CannyDetector,
    MidasDetector,
    PidiNetDetector,
)
from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img import retrieve_timesteps, retrieve_latents
from PIL import Image, ImageFilter
from inversion_utils import get_ddpm_inversion_scheduler, create_xts
from config import get_config, get_num_steps_actual
from functools import partial
from compel import Compel, ReturnedEmbeddingsType

class Object:
    """Empty attribute container; fields are attached to instances after creation."""

# Fixed settings namespace; it is handed to `get_config` further down and
# read again inside `run`.
args = Object()
vars(args).update(
    images_paths=None,
    images_folder=None,
    force_use_cpu=False,
    folder_name='test_measure_time',
    config_from_file='run_configs/noise_shift_guidance_1_5.yaml',
    save_intermediate_results=False,
    batch_size=None,
    skip_p_to_p=True,
    only_p_to_p=False,
    fp16=False,
    prompts_file='dataset_measure_time/dataset.json',
    images_in_prompts_file=None,
    seed=986,
    time_measure_n=1,
)

# Saving intermediate results is only allowed when no batch size is set.
assert args.batch_size is None or args.save_intermediate_results is False, (
    "save_intermediate_results is not implemented for batch_size > 1"
)

# Module-level default generator; it is never assigned a real generator here.
generator = None

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint used for both the pipeline weights and the scheduler config.
# Alternatives that were tried are kept below for reference.
# BASE_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"
BASE_MODEL = "stabilityai/sdxl-turbo"
# BASE_MODEL = "SG161222/RealVisXL_V5.0_Lightning"
# BASE_MODEL = "Lykon/dreamshaper-xl-v2-turbo"
# BASE_MODEL = "RunDiffusion/Juggernaut-XL-Lightning"

# Annotators that derive conditioning images from the input picture.
# `run` calls the lineart and canny detectors; the pidinet and midas
# detectors are only referenced from commented-out lines there.
lineart_detector = LineartDetector.from_pretrained("lllyasviel/Annotators").to(device)

pidinet_detector = PidiNetDetector.from_pretrained("lllyasviel/Annotators").to(device)

canndy_detector = CannyDetector()

midas_detector = MidasDetector.from_pretrained(
    "valhalla/t2iadapter-aux-models",
    filename="dpt_large_384.pt",
    model_type="dpt_large",
).to(device)

# Conditioning adapters, listed in the same order as the `cond_image` and
# `conditioning_scale` lists built in `run`: lineart first, then canny.
#
# The keyword is spelled `variant`; the previous `varient` spelling is not a
# recognised loading option, so the fp16 weight files were never requested.
adapters = MultiAdapter(
    [
        T2IAdapter.from_pretrained(
            "TencentARC/t2i-adapter-lineart-sdxl-1.0",
            torch_dtype=torch.float16,
            variant="fp16",
        ),
        T2IAdapter.from_pretrained(
            "TencentARC/t2i-adapter-canny-sdxl-1.0",
            torch_dtype=torch.float16,
            variant="fp16",
        ),
        # T2IAdapter.from_pretrained(
        #     "TencentARC/t2i-adapter-sketch-sdxl-1.0",
        #     torch_dtype=torch.float16,
        #     variant="fp16",
        # ),
        # T2IAdapter.from_pretrained(
        #     "TencentARC/t2i-adapter-depth-midas-sdxl-1.0",
        #     torch_dtype=torch.float16,
        #     variant="fp16",
        # ),
    ]
)
# Make sure the combined module is in half precision as a whole.
adapters = adapters.to(torch.float16)

# Build the custom SDXL adapter img2img pipeline in half precision, with a
# separately loaded VAE and the multi-adapter defined above, then move it to
# the selected device.
pipeline = DiffusionPipeline.from_pretrained(
    BASE_MODEL,
    custom_pipeline="./pipelines/pipeline_sdxl_adapter_img2img.py",
    adapter=adapters,
    vae=AutoencoderKL.from_pretrained(
        "madebyollin/sdxl-vae-fp16-fix",
        torch_dtype=torch.float16,
    ),
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
).to(device)

# Replace the pipeline's scheduler with a DDPM scheduler built from the base
# model's scheduler configuration.
pipeline.scheduler = DDPMScheduler.from_pretrained(BASE_MODEL, subfolder="scheduler")

# Run configuration derived from the fixed `args` namespace.
config = get_config(args)

# Prompt encoder covering both SDXL tokenizer/text-encoder pairs; only the
# second encoder contributes a pooled embedding.
compel_proc = Compel(
    tokenizer=[pipeline.tokenizer, pipeline.tokenizer_2],
    text_encoder=[pipeline.text_encoder, pipeline.text_encoder_2],
    returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,
    requires_pooled=[False, True],
)

def run(
    input_image: Image.Image,
    mask_image: Image.Image,
    src_prompt: str,
    tgt_prompt: str,
    generate_size: int,
    seed: int,
    w1: float,
    w2: float,
    num_steps: int,
    start_step: int,
    guidance_scale: float,
    strength: float = 0.3,
    lineart_scale: float = 0.5,
    canny_scale: float = 0.5,
    lineart_detect: float = 0.375,
    canny_detect: float = 0.375,
    blur_radius: int = 40,
):
    """Edit `input_image` from `src_prompt` towards `tgt_prompt`.

    The function mutates module-level state: it writes several fields of
    `config`, replaces `pipeline.__call__` with a `functools.partial` carrying
    this call's settings, and replaces `pipeline.scheduler` with the scheduler
    returned by `get_ddpm_inversion_scheduler`.

    Args:
        input_image: Picture to edit. Lineart and canny conditioning images
            are computed from it before any blurring.
        mask_image: Mask converted to mode "L". After Gaussian blurring it is
            used as the blend weight between a blurred copy of the input
            (where the mask is white) and the untouched input.
        src_prompt: Prompt used for the first two entries of the batch.
        tgt_prompt: Prompt used for the third entry of the batch; the image
            generated for this entry is returned.
        generate_size: `image_resolution` passed to the detectors; the
            detection resolutions are fractions of it.
        seed: Seed for the random generators used by this call.
        w1: Value repeated for every step in `config.ws1`.
        w2: Value repeated for every step in `config.ws2`.
        num_steps: Stored as `config.num_steps_inversion`.
        start_step: Stored as `config.step_start` before the step count is
            derived from the configuration.
        guidance_scale: Forwarded to the pipeline call.
        strength: Forwarded to `pipeline.get_timesteps` and the pipeline call.
        lineart_scale: Adapter conditioning scale for the lineart image.
        canny_scale: Adapter conditioning scale for the canny image.
        lineart_detect: Fraction of `generate_size` used as the lineart
            detection resolution.
        canny_detect: Fraction of `generate_size` used as the canny detection
            resolution.
        blur_radius: Radius of the Gaussian blur applied to the input copy
            and to the mask.

    Returns:
        The third image produced by the pipeline (the `tgt_prompt` entry).
    """
    generator = torch.Generator().manual_seed(seed)

    config.num_steps_inversion = num_steps
    config.step_start = start_step
    num_steps_actual = get_num_steps_actual(config)

    num_steps_inversion = config.num_steps_inversion
    denoising_start = (num_steps_inversion - num_steps_actual) / num_steps_inversion
    print(f"-------->num_steps_inversion: {num_steps_inversion} num_steps_actual: {num_steps_actual} denoising_start: {denoising_start}")

    timesteps, num_inference_steps = retrieve_timesteps(
        pipeline.scheduler, num_steps_inversion, device, None
    )
    timesteps, num_inference_steps = pipeline.get_timesteps(
        num_inference_steps=num_inference_steps,
        denoising_start=denoising_start,
        strength=strength,
        device=device,
    )
    timesteps = timesteps.type(torch.int64)
    timesteps = [torch.tensor(t) for t in timesteps.tolist()]

    # The pipeline may return a different number of timesteps than the
    # configuration predicted; realign the start step and step count with it.
    timesteps_len = len(timesteps)
    config.step_start = start_step + num_steps_actual - timesteps_len
    num_steps_actual = timesteps_len
    config.max_norm_zs = [-1] * (num_steps_actual - 1) + [15.5]
    print(f"-------->num_steps_inversion: {num_steps_inversion} num_steps_actual: {num_steps_actual} step_start: {config.step_start}")
    print(f"-------->timesteps len: {len(timesteps)} max_norm_zs len: {len(config.max_norm_zs)}")

    # Conditioning images come from the original (unblurred) input.
    lineart_image = lineart_detector(
        input_image,
        detect_resolution=int(generate_size * lineart_detect),
        image_resolution=generate_size,
    )
    canny_image = canndy_detector(
        input_image,
        detect_resolution=int(generate_size * canny_detect),
        image_resolution=generate_size,
    )
    # pidinet_image = pidinet_detector(input_image, detect_resolution=512, image_resolution=generate_size, apply_filter=True)
    # depth_image = midas_detector(input_image, detect_resolution=512, image_resolution=generate_size)
    cond_image = [lineart_image, canny_image]
    conditioning_scale = [lineart_scale, canny_scale]

    # Blend a blurred copy of the input over the original, weighted by the
    # blurred mask.
    mask_image = mask_image.convert("L")
    blur = ImageFilter.GaussianBlur(blur_radius)
    input_image = Image.composite(input_image.filter(blur), input_image, mask_image.filter(blur))

    # Bind this call's settings onto the pipeline instance. On later calls the
    # attribute is already a partial; `functools.partial` merges nested
    # partials, so the newest keyword values take effect.
    pipeline.__call__ = partial(
        pipeline.__call__,
        num_inference_steps=num_steps_inversion,
        guidance_scale=guidance_scale,
        generator=generator,
        denoising_start=denoising_start,
        strength=strength,
        adapter_image=cond_image,
        adapter_conditioning_scale=conditioning_scale,
    )

    # Seed the VAE sampling with its own generator so that `seed` determines
    # the encoded latents as well, without consuming draws from `generator`,
    # which is handed to `create_xts` and to the pipeline call.
    x_0 = encode_image(
        input_image,
        pipeline,
        generator=torch.Generator().manual_seed(seed),
    )
    x_ts = create_xts(1, None, 0, generator, pipeline.scheduler, timesteps, x_0, no_add_noise=False)
    x_ts = [xt.to(dtype=torch.float16) for xt in x_ts]
    latents = [x_ts[0]]
    x_ts_c_hat = [None]
    config.ws1 = [w1] * num_steps_actual
    config.ws2 = [w2] * num_steps_actual
    pipeline.scheduler = get_ddpm_inversion_scheduler(
        pipeline.scheduler,
        config.step_function,
        config,
        timesteps,
        config.save_timesteps,
        latents,
        x_ts,
        x_ts_c_hat,
        args.save_intermediate_results,
        pipeline,
        x_0,
        [],  # v1s_images
        [],  # v2s_images
        [],  # deltas_images
        [],  # v1_x0s
        [],  # v2_x0s
        [],  # deltas_x0s
        "res12",
        image_name="im_name",
        time_measure_n=args.time_measure_n,
    )

    # Batch of three: two source-prompt entries and one target-prompt entry.
    latent = latents[0].expand(3, -1, -1, -1)
    prompt = [src_prompt, src_prompt, tgt_prompt]
    conditioning, pooled = compel_proc(prompt)

    image = pipeline.__call__(
        image=latent,
        prompt_embeds=conditioning,
        pooled_prompt_embeds=pooled,
        eta=1,
    ).images
    return image[2]


def encode_image(image, pipe, generator=None):
    """Encode `image` with the pipeline's VAE into scaled float16 latents.

    Args:
        image: Input accepted by `pipe.image_processor.preprocess`.
        pipe: Pipeline providing `image_processor`, `vae` and `dtype`.
        generator: Optional random generator forwarded to `retrieve_latents`.
            When a list is given, only the first image is encoded, using the
            first generator. Defaults to None (unseeded).

    Returns:
        The latents multiplied by the VAE scaling factor, as float16.
    """
    image = pipe.image_processor.preprocess(image)
    pipe_dtype = pipe.dtype
    image = image.to(device=device, dtype=pipe_dtype)

    # Some VAEs must run in float32; upcast temporarily when configured so.
    needs_upcast = pipe.vae.config.force_upcast
    if needs_upcast:
        image = image.float()
        pipe.vae.to(dtype=torch.float32)

    try:
        # Encoding is inference-only, so no autograd graph is recorded.
        with torch.no_grad():
            if isinstance(generator, list):
                init_latents = retrieve_latents(
                    pipe.vae.encode(image[:1]), generator=generator[0]
                )
            else:
                init_latents = retrieve_latents(pipe.vae.encode(image), generator=generator)
    finally:
        # Restore the VAE dtype even when encoding fails.
        if needs_upcast:
            pipe.vae.to(pipe_dtype)

    init_latents = init_latents.to(pipe_dtype)
    init_latents = pipe.vae.config.scaling_factor * init_latents

    return init_latents.to(dtype=torch.float16)

def get_timesteps(pipe, num_inference_steps, strength, device, denoising_start=None):
    """Select the tail of the scheduler's timesteps that will be denoised.

    Args:
        pipe: Object exposing `scheduler.timesteps`, `scheduler.order` and
            `scheduler.config.num_train_timesteps`.
        num_inference_steps: Total number of scheduled steps; only used when
            `denoising_start` is None.
        strength: Fraction of the schedule to run; only used when
            `denoising_start` is None.
        device: Unused; kept for signature compatibility.
        denoising_start: Optional fraction of the training schedule to skip.
            When given, it determines the starting point and `strength` is
            ignored.

    Returns:
        A `(timesteps, num_inference_steps)` tuple: the selected timesteps and
        the number of steps reported for them.
    """
    scheduler = pipe.scheduler

    if denoising_start is None:
        # Derive the first step from `strength`.
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = scheduler.timesteps[t_start * scheduler.order :]
        return timesteps, num_inference_steps - t_start

    # Strength is irrelevant if we directly request a timestep to start at;
    # that is, strength is determined by the denoising_start instead.
    timesteps = scheduler.timesteps
    num_train_timesteps = scheduler.config.num_train_timesteps
    discrete_timestep_cutoff = int(
        round(num_train_timesteps - (denoising_start * num_train_timesteps))
    )

    num_inference_steps = (timesteps < discrete_timestep_cutoff).sum().item()
    if scheduler.order == 2 and num_inference_steps % 2 == 0:
        # A 2nd order scheduler duplicates every timestep except the highest
        # one, so an even count would cut in the middle of a denoising step
        # (between the 1st and 2nd derivative). Adding 1 makes the process
        # always end after the 2nd derivative step.
        num_inference_steps = num_inference_steps + 1

    # Because t_n+1 >= t_n, keep the last `num_inference_steps` entries. An
    # explicit start index is used instead of a negative slice so that a count
    # of 0 yields no timesteps rather than the whole schedule (`x[-0:]` is
    # `x[0:]`); the clamp keeps the whole schedule when the count exceeds it.
    first_kept = max(len(timesteps) - num_inference_steps, 0)
    return timesteps[first_kept:], num_inference_steps