Integration Testing against original implementation

#1 opened by patrickvonplaten

This thread quickly shows that the diffusers implementation and the original implementation as described here are equivalent apart from some small numerical differences, which are due to the different attention layers and precision being used.

I used the default values as advertised here and, with the prompt "red bird", created the following Python script:

from share import *
import config

import cv2
import einops
import numpy as np
import torch
import random

from pytorch_lightning import seed_everything
from annotator.util import resize_image, HWC3
from annotator.canny import CannyDetector
from cldm.model import create_model, load_state_dict
from cldm.ddim_hacked import DDIMSampler
import PIL


preprocessor = None

model_name = 'control_v11p_sd15_canny'

# Build the model from its YAML config, then load the SD 1.5 base weights and the
# Canny ControlNet weights on top (strict=False lets the two partial checkpoints be merged).
model = create_model(f'/home/patrick/controlnet_v1_1/ControlNet-v1-1/{model_name}.yaml').cpu()
model.load_state_dict(load_state_dict('/home/patrick/controlnet_v1_1/v1-5-pruned.ckpt', location='cuda'), strict=False)
model.load_state_dict(load_state_dict(f'/home/patrick/controlnet_v1_1/ControlNet-v1-1/{model_name}.pth', location='cuda'), strict=False)
model = model.cuda()
ddim_sampler = DDIMSampler(model)

# Sampling routine following the original repository's Canny demo; the UI inputs are exposed as keyword arguments.
def process(input_image, prompt, a_prompt=None, n_prompt="", num_samples=1, image_resolution=512, detect_resolution=512, ddim_steps=50, guess_mode=False, strength=1.0, scale=9.0, seed=0, eta=1.0, low_threshold=100, high_threshold=200):
    global preprocessor

    det = "Canny"
    if det == 'Canny':
        if not isinstance(preprocessor, CannyDetector):
            preprocessor = CannyDetector()

    with torch.no_grad():
        input_image = HWC3(input_image)

        if det == 'None':
            detected_map = input_image.copy()
        else:
            detected_map = preprocessor(resize_image(input_image, detect_resolution), low_threshold, high_threshold)
            detected_map = HWC3(detected_map)

        img = resize_image(input_image, image_resolution)
        H, W, C = img.shape

        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)

        # Normalize the edge map to [0, 1] and bring it into NCHW layout for the ControlNet.
        control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
        control = torch.stack([control for _ in range(num_samples)], dim=0)
        control = einops.rearrange(control, 'b h w c -> b c h w').clone()

        if seed == -1:
            seed = random.randint(0, 65535)
        seed_everything(seed)

        if config.save_memory:
            model.low_vram_shift(is_diffusing=False)

        # Conditioning: the control image goes into c_concat, the text embeddings into c_crossattn;
        # in guess mode the unconditional branch receives no control signal.
        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
        un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
        shape = (4, H // 8, W // 8)

        if config.save_memory:
            model.low_vram_shift(is_diffusing=True)

        model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)
        # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01

        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
                                                     shape, cond, verbose=False, eta=eta,
                                                     unconditional_guidance_scale=scale,
                                                     unconditional_conditioning=un_cond)

        if config.save_memory:
            model.low_vram_shift(is_diffusing=False)

        # Decode the latents with the first-stage VAE and convert to uint8 HWC images.
        x_samples = model.decode_first_stage(samples)
        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)

        results = [x_samples[i] for i in range(num_samples)]
    return [detected_map] + results

input_image = PIL.Image.open("/home/patrick/controlnet_v1_1/control_v11p_sd15_canny/images/input.png")
input_image = np.asarray(input_image, dtype=np.uint8)
prompt = "red bird"

output = process(
    input_image,
    prompt,
    eta=1.0,
    ddim_steps=20,
    scale=9.0,
    seed=12345,
    a_prompt="best quality",
    n_prompt="lowres, bad anatomy, bad hands, cropped, worst quality"
)

img_1 = PIL.Image.fromarray(output[0])
img_2 = PIL.Image.fromarray(output[1])

img_1.save("/home/patrick/images/canny_orig_1.png")
img_2.save("/home/patrick/images/canny_orig_2.png")

Running this script gives me the following two images:
canny_orig_1.png
canny_orig_2.png

Now doing more or less the equivalent with `diffusers`:

from pytorch_lightning import seed_everything
import torch
import os
from huggingface_hub import HfApi
from annotator.util import resize_image, HWC3
from pathlib import Path
from diffusers.utils import load_image
from diffusers.models.attention_processor import AttnProcessor
import numpy as np
import cv2
from PIL import Image

from diffusers import (
    ControlNetModel,
    StableDiffusionControlNetPipeline,
    DDIMScheduler,
)

checkpoint = "lllyasviel/control_v11p_sd15_canny"

# Load the same input image as above from the Hub and resize it with the same helper as the original script.
image = load_image(
    "https://huggingface.co/lllyasviel/control_v11p_sd15_canny/resolve/main/images/input.png"
)

image = np.array(image)

image = resize_image(image, 512)

low_threshold = 100
high_threshold = 200

# Same Canny edge detection as the original CannyDetector (a thin wrapper around cv2.Canny),
# stacked to three channels so it can be used as a conditioning image.
image = cv2.Canny(image, low_threshold, high_threshold)
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
control_image = Image.fromarray(image)

# Load the ControlNet weights and the SD 1.5 pipeline in float16, and swap in the
# DDIM scheduler to match the DDIM sampler used in the original script.
controlnet = ControlNetModel.from_pretrained(checkpoint, torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")

prompt = "bird, best quality"
seed = 12345
negative_prompt = "lowres, bad anatomy, bad hands, cropped, worst quality"
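# The "best quality" suffix and the negative prompt mirror the a_prompt and n_prompt passed to process() in the original script.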

seed_everything(seed)
image = pipe(prompt, num_inference_steps=20, negative_prompt=negative_prompt, image=control_image, guidance_scale=9.0, eta=1.0).images[0]

control_image.save("/home/patrick/images/canny_diff_1.png")
image.save("/home/patrick/images/canny_diff_2.png")

We get:
canny_diff_1.png
canny_diff_2.png

As one can see, there are some differences: the diffusers version has some green feathers while the original one does not. Apart from this, I would argue that the images are extremely similar for such a non-specific prompt; the more specific the prompt, the more similar the pictures would be.
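If one wants to put a rough number on how close the two outputs are, the saved PNGs can be compared pixel-wise. This is only a crude sketch (it assumes both scripts above were run and produced images of the same resolution, which they should since both use the same resize helper with 512):

import numpy as np
from PIL import Image

# Load the two generated samples saved by the scripts above.
orig = np.asarray(Image.open("/home/patrick/images/canny_orig_2.png"), dtype=np.float32)
diff = np.asarray(Image.open("/home/patrick/images/canny_diff_2.png"), dtype=np.float32)

# Mean absolute pixel difference and PSNR as crude similarity measures;
# with eta=1.0 the DDIM noise draws differ between the two code bases, so
# these numbers only give a rough sense of how far apart the samples are.
mae = np.abs(orig - diff).mean()
mse = ((orig - diff) ** 2).mean()
psnr = 10 * np.log10(255.0 ** 2 / mse)
print(f"mean abs diff: {mae:.2f} / 255, PSNR: {psnr:.2f} dB")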

I'm running the examples on PyTorch 2.0 on an RTX 4090 GPU and use "pure" float16 precision for diffusers. Also, diffusers makes use of PyTorch 2.0's accelerated (scaled dot-product) attention, while the original code seems to run with "normal" PyTorch attention in float32.
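If one wanted to rule these two factors out, the diffusers pipeline could be loaded in float32 and switched back to the plain attention processor. A sketch of how that could look (slower, but closer to the original code path):

import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, DDIMScheduler
from diffusers.models.attention_processor import AttnProcessor

# Load everything in float32 to match the precision used by the original code.
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny", torch_dtype=torch.float32)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float32
)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)

# Replace PyTorch 2.0's scaled dot-product attention with the "vanilla" attention processor.
pipe.unet.set_attn_processor(AttnProcessor())
pipe.controlnet.set_attn_processor(AttnProcessor())
pipe.vae.set_attn_processor(AttnProcessor())
pipe.to("cuda")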
