Integration testing against the original implementation
This thread quickly shows that the `diffusers` implementation and the original implementation as described here are equivalent, apart from some small numerical differences that are due to the different attention layers and precision being used.
I used the default values as advertised here and the prompt "blue bird", and created the following Python script:
```python
from share import *
import config

import cv2
import einops
import numpy as np
import torch
import random

from pytorch_lightning import seed_everything
from annotator.util import resize_image, HWC3
from annotator.canny import CannyDetector
from cldm.model import create_model, load_state_dict
from cldm.ddim_hacked import DDIMSampler
import PIL

preprocessor = None
model_name = 'control_v11p_sd15_canny'

model = create_model(f'/home/patrick/controlnet_v1_1/ControlNet-v1-1/{model_name}.yaml').cpu()
model.load_state_dict(load_state_dict('/home/patrick/controlnet_v1_1/v1-5-pruned.ckpt', location='cuda'), strict=False)
model.load_state_dict(load_state_dict(f'/home/patrick/controlnet_v1_1/ControlNet-v1-1/{model_name}.pth', location='cuda'), strict=False)
model = model.cuda()
ddim_sampler = DDIMSampler(model)


def process(input_image, prompt, a_prompt=None, n_prompt="", num_samples=1, image_resolution=512, detect_resolution=512, ddim_steps=50, guess_mode=False, strength=1.0, scale=9.0, seed=0, eta=1.0, low_threshold=100, high_threshold=200):
    global preprocessor

    det = "Canny"
    if det == 'Canny':
        if not isinstance(preprocessor, CannyDetector):
            preprocessor = CannyDetector()

    with torch.no_grad():
        input_image = HWC3(input_image)

        if det == 'None':
            detected_map = input_image.copy()
        else:
            detected_map = preprocessor(resize_image(input_image, detect_resolution), low_threshold, high_threshold)
            detected_map = HWC3(detected_map)

        img = resize_image(input_image, image_resolution)
        H, W, C = img.shape

        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)

        control = torch.from_numpy(detected_map.copy()).float().cuda() / 255.0
        control = torch.stack([control for _ in range(num_samples)], dim=0)
        control = einops.rearrange(control, 'b h w c -> b c h w').clone()

        if seed == -1:
            seed = random.randint(0, 65535)
        seed_everything(seed)

        if config.save_memory:
            model.low_vram_shift(is_diffusing=False)

        cond = {"c_concat": [control], "c_crossattn": [model.get_learned_conditioning([prompt + ', ' + a_prompt] * num_samples)]}
        un_cond = {"c_concat": None if guess_mode else [control], "c_crossattn": [model.get_learned_conditioning([n_prompt] * num_samples)]}
        shape = (4, H // 8, W // 8)

        if config.save_memory:
            model.low_vram_shift(is_diffusing=True)

        model.control_scales = [strength * (0.825 ** float(12 - i)) for i in range(13)] if guess_mode else ([strength] * 13)
        # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01

        samples, intermediates = ddim_sampler.sample(ddim_steps, num_samples,
                                                     shape, cond, verbose=False, eta=eta,
                                                     unconditional_guidance_scale=scale,
                                                     unconditional_conditioning=un_cond)

        if config.save_memory:
            model.low_vram_shift(is_diffusing=False)

        x_samples = model.decode_first_stage(samples)
        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)

        results = [x_samples[i] for i in range(num_samples)]
    return [detected_map] + results


input_image = PIL.Image.open("/home/patrick/controlnet_v1_1/control_v11p_sd15_canny/images/input.png")
input_image = np.asarray(input_image, dtype=np.uint8)

prompt = "red bird"

output = process(
    input_image,
    prompt,
    eta=1.0,
    ddim_steps=20,
    scale=9.0,
    seed=12345,
    a_prompt="best quality",
    n_prompt="lowres, bad anatomy, bad hands, cropped, worst quality",
)

img_1 = PIL.Image.fromarray(output[0])
img_2 = PIL.Image.fromarray(output[1])

img_1.save("/home/patrick/images/canny_orig_1.png")
img_2.save("/home/patrick/images/canny_orig_2.png")
```
Now, doing more or less the equivalent with `diffusers`:
```python
from pytorch_lightning import seed_everything
import torch
import os
from huggingface_hub import HfApi
from annotator.util import resize_image, HWC3
from pathlib import Path
from diffusers.utils import load_image
from diffusers.models.attention_processor import AttnProcessor
import numpy as np
import cv2
from PIL import Image

from diffusers import (
    ControlNetModel,
    StableDiffusionControlNetPipeline,
    DDIMScheduler,
)

checkpoint = "lllyasviel/control_v11p_sd15_canny"

image = load_image(
    "https://huggingface.co/lllyasviel/control_v11p_sd15_canny/resolve/main/images/input.png"
)
image = np.array(image)
image = resize_image(image, 512)

low_threshold = 100
high_threshold = 200

image = cv2.Canny(image, low_threshold, high_threshold)
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
control_image = Image.fromarray(image)

controlnet = ControlNetModel.from_pretrained(checkpoint, torch_dtype=torch.float16)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")

prompt = "bird, best quality"
seed = 12345
negative_prompt = "lowres, bad anatomy, bad hands, cropped, worst quality"

seed_everything(seed)
image = pipe(prompt, num_inference_steps=20, negative_prompt=negative_prompt, image=control_image, guidance_scale=9.0, eta=1.0).images[0]

control_image.save("/home/patrick/images/canny_diff_1.png")
image.save("/home/patrick/images/canny_diff_2.png")
```
As one can see, there are some differences: the `diffusers` version has some green feathers while the original one does not. Apart from this, I would argue that the images are extremely similar for such a non-specific prompt. The more specific the prompt, the more similar the pictures would be.
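To put a number on "extremely similar", one could also compare the two generated images pixel-wise. The following is only a rough sketch on top of the scripts above; it assumes both outputs were saved to the paths used there and have the same resolution:

```python
import numpy as np
from PIL import Image

# Paths as used in the two scripts above (adjust if you saved the outputs elsewhere).
orig = np.asarray(Image.open("/home/patrick/images/canny_orig_2.png"), dtype=np.float32)
diff = np.asarray(Image.open("/home/patrick/images/canny_diff_2.png"), dtype=np.float32)

# Pixel-wise error statistics over the RGB values (0-255 range).
abs_err = np.abs(orig - diff)
print("mean abs pixel difference:", abs_err.mean())
print("max abs pixel difference:", abs_err.max())
```

Given the precision and attention differences discussed below, one should not expect these numbers to be zero even with the same seed.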
I'm running the examples on PyTorch 2.0 and an RTX 4090 GPU and use "pure" float16 precision for `diffusers`. Also, `diffusers` makes use of PyTorch's accelerated (scaled dot-product) attention, while the original code seems to run "normal" PyTorch attention in float32.
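If one wanted to rule out those two sources of divergence, a possible (untested) sketch is to load the `diffusers` pipeline in float32 and switch back to the vanilla attention processor; this assumes `AttnProcessor` and `set_attn_processor` are available in the installed `diffusers` version:

```python
import torch
from diffusers import ControlNetModel, StableDiffusionControlNetPipeline, DDIMScheduler
from diffusers.models.attention_processor import AttnProcessor

checkpoint = "lllyasviel/control_v11p_sd15_canny"

# Load everything in float32 instead of float16 to match the original script's precision.
controlnet = ControlNetModel.from_pretrained(checkpoint, torch_dtype=torch.float32)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float32
)
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)

# Swap PyTorch 2.0's accelerated attention for the plain attention processor.
pipe.unet.set_attn_processor(AttnProcessor())
pipe.controlnet.set_attn_processor(AttnProcessor())

pipe.to("cuda")
```

With that, the remaining differences should come down to implementation details rather than precision or the attention backend.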