T2I-Adapter-SDXL / model.py
hysts's picture
hysts HF staff
Update
93b806c
import gc
import os
from abc import ABC, abstractmethod
import numpy as np
import PIL.Image
import torch
from controlnet_aux import (
CannyDetector,
LineartDetector,
MidasDetector,
OpenposeDetector,
PidiNetDetector,
ZoeDetector,
)
from diffusers import (
AutoencoderKL,
EulerAncestralDiscreteScheduler,
StableDiffusionXLAdapterPipeline,
T2IAdapter,
)
# Resolution buckets used to pick the output size of a conditioning image.
# Each key is the bucket's width / height ratio written as a string (rounded to
# at most two decimals); each value is the bucket's (width, height) in pixels.
# Every dimension listed is a multiple of 64.
# NOTE(review): presumably these are the resolutions SDXL base was trained on
# -- confirm against the SDXL documentation.
SD_XL_BASE_RATIOS = {
    "0.5": (704, 1408),
    "0.52": (704, 1344),
    "0.57": (768, 1344),
    "0.6": (768, 1280),
    "0.68": (832, 1216),
    "0.72": (832, 1152),
    "0.78": (896, 1152),
    "0.82": (896, 1088),
    "0.88": (960, 1088),
    "0.94": (960, 1024),
    "1.0": (1024, 1024),
    "1.07": (1024, 960),
    "1.13": (1088, 960),
    "1.21": (1088, 896),
    "1.29": (1152, 896),
    "1.38": (1152, 832),
    "1.46": (1216, 832),
    "1.67": (1280, 768),
    "1.75": (1344, 768),
    "1.91": (1344, 704),
    "2.0": (1408, 704),
    "2.09": (1472, 704),
    "2.4": (1536, 640),
    "2.5": (1600, 640),
    "2.89": (1664, 576),
    "3.0": (1728, 576),
}
def find_closest_aspect_ratio(target_width: int, target_height: int) -> str:
    """Return the key of the ``SD_XL_BASE_RATIOS`` bucket nearest to the target.

    "Nearest" means the smallest absolute difference between the target's
    ``width / height`` and the bucket's ``width / height``. When two buckets
    are equally close, the one declared first in the table wins.

    Args:
        target_width: Width of the source image in pixels.
        target_height: Height of the source image in pixels.

    Returns:
        The ratio key of the closest bucket, or ``""`` if the table is empty.

    Raises:
        ZeroDivisionError: If ``target_height`` is zero.
    """
    wanted = target_width / target_height

    def distance(ratio_key: str) -> float:
        bucket_width, bucket_height = SD_XL_BASE_RATIOS[ratio_key]
        return abs(wanted - bucket_width / bucket_height)

    # min() keeps the first of several equally-good candidates, which matches
    # a strict "<" comparison while scanning the table in order.
    return min(SD_XL_BASE_RATIOS, key=distance, default="")
def resize_to_closest_aspect_ratio(image: PIL.Image.Image) -> PIL.Image.Image:
    """Resize ``image`` to the ``SD_XL_BASE_RATIOS`` bucket closest to its shape.

    The image is resampled with LANCZOS to exactly the bucket's
    (width, height). Because the bucket ratio only approximates the input
    ratio, the result may be slightly stretched rather than cropped or padded.

    Args:
        image: Image to resize; it is not modified.

    Returns:
        A new image whose size is one of the values of ``SD_XL_BASE_RATIOS``.
    """
    bucket_key = find_closest_aspect_ratio(*image.size)
    bucket_size = SD_XL_BASE_RATIOS[bucket_key]
    return image.resize(bucket_size, PIL.Image.LANCZOS)
# Repository id of the T2I-Adapter weights for each supported adapter name.
# These ids are what gets passed to T2IAdapter.from_pretrained in this module.
ADAPTER_REPO_IDS = {
    "canny": "TencentARC/t2i-adapter-canny-sdxl-1.0",
    "sketch": "TencentARC/t2i-adapter-sketch-sdxl-1.0",
    "lineart": "TencentARC/t2i-adapter-lineart-sdxl-1.0",
    "depth-midas": "TencentARC/t2i-adapter-depth-midas-sdxl-1.0",
    "depth-zoe": "TencentARC/t2i-adapter-depth-zoe-sdxl-1.0",
    "openpose": "TencentARC/t2i-adapter-openpose-sdxl-1.0",
    # Disabled entry: with this line commented out, "recolor" is absent from
    # ADAPTER_NAMES and is therefore rejected by Model, even though a
    # RecolorPreprocessor is still defined in this file.
    # "recolor": "TencentARC/t2i-adapter-recolor-sdxl-1.0",
}
# Names accepted wherever an adapter name is validated, in declaration order.
ADAPTER_NAMES = list(ADAPTER_REPO_IDS.keys())
class Preprocessor(ABC):
    """Interface shared by every conditioning-image preprocessor in this file.

    A preprocessor is callable: it receives a PIL image and returns the PIL
    image that is subsequently fed to the adapter pipeline.
    """

    @abstractmethod
    def to(self, device: torch.device | str) -> "Preprocessor":
        """Place the preprocessor on ``device`` and return the preprocessor."""

    @abstractmethod
    def __call__(self, image: PIL.Image.Image) -> PIL.Image.Image:
        """Return the preprocessed counterpart of ``image``."""
class CannyPreprocessor(Preprocessor):
    """Preprocessor that delegates to ``controlnet_aux.CannyDetector``."""

    def __init__(self) -> None:
        """Create the detector; no pretrained weights are requested."""
        self.model = CannyDetector()

    def to(self, device: torch.device | str) -> Preprocessor:
        """Return ``self`` unchanged; the detector is never moved to ``device``.

        NOTE(review): presumably the Canny detector holds no torch weights --
        confirm against controlnet_aux.
        """
        return self

    def __call__(self, image: PIL.Image.Image) -> PIL.Image.Image:
        """Run the detector with ``detect_resolution=384`` and ``image_resolution=1024``."""
        detected = self.model(image, detect_resolution=384, image_resolution=1024)
        return detected
class LineartPreprocessor(Preprocessor):
    """Preprocessor that delegates to ``controlnet_aux.LineartDetector``."""

    def __init__(self) -> None:
        """Load the detector from the ``lllyasviel/Annotators`` repository."""
        self.model = LineartDetector.from_pretrained("lllyasviel/Annotators")

    def to(self, device: torch.device | str) -> Preprocessor:
        """Move the detector to ``device`` and return ``self`` for chaining."""
        self.model.to(device)
        return self

    def __call__(self, image: PIL.Image.Image) -> PIL.Image.Image:
        """Run the detector with ``detect_resolution=384`` and ``image_resolution=1024``."""
        detected = self.model(image, detect_resolution=384, image_resolution=1024)
        return detected
class MidasPreprocessor(Preprocessor):
    """Preprocessor that delegates to ``controlnet_aux.MidasDetector``."""

    def __init__(self) -> None:
        """Load the ``dpt_large`` checkpoint from ``valhalla/t2iadapter-aux-models``."""
        self.model = MidasDetector.from_pretrained(
            "valhalla/t2iadapter-aux-models",
            filename="dpt_large_384.pt",
            model_type="dpt_large",
        )

    def to(self, device: torch.device | str) -> Preprocessor:
        """Move the detector to ``device`` and return ``self`` for chaining."""
        self.model.to(device)
        return self

    def __call__(self, image: PIL.Image.Image) -> PIL.Image.Image:
        """Run the detector with ``detect_resolution=512`` and ``image_resolution=1024``."""
        detected = self.model(image, detect_resolution=512, image_resolution=1024)
        return detected
class OpenposePreprocessor(Preprocessor):
    """Preprocessor that delegates to ``controlnet_aux.OpenposeDetector``."""

    def __init__(self) -> None:
        """Load the detector from the ``lllyasviel/Annotators`` repository."""
        self.model = OpenposeDetector.from_pretrained("lllyasviel/Annotators")

    def to(self, device: torch.device | str) -> Preprocessor:
        """Move the detector to ``device`` and return ``self`` for chaining."""
        self.model.to(device)
        return self

    def __call__(self, image: PIL.Image.Image) -> PIL.Image.Image:
        """Run the detector and return its output with the last axis reversed.

        The detector runs with ``detect_resolution=512`` and
        ``image_resolution=1024``.
        """
        detected = self.model(image, detect_resolution=512, image_resolution=1024)
        # Reverse the order of the last (channel) axis of the H x W x C array.
        # NOTE(review): presumably this swaps RGB <-> BGR because the adapter
        # expects the other channel order -- confirm.
        flipped = np.array(detected)[:, :, ::-1]
        return PIL.Image.fromarray(np.uint8(flipped))
class PidiNetPreprocessor(Preprocessor):
    """Preprocessor that delegates to ``controlnet_aux.PidiNetDetector``.

    This is the preprocessor registered under the ``"sketch"`` adapter name.
    """

    def __init__(self) -> None:
        """Load the detector from the ``lllyasviel/Annotators`` repository."""
        self.model = PidiNetDetector.from_pretrained("lllyasviel/Annotators")

    def to(self, device: torch.device | str) -> Preprocessor:
        """Move the detector to ``device`` and return ``self`` for chaining."""
        self.model.to(device)
        return self

    def __call__(self, image: PIL.Image.Image) -> PIL.Image.Image:
        """Run the detector with ``apply_filter=True`` at 512 / 1024 resolution."""
        detected = self.model(
            image,
            detect_resolution=512,
            image_resolution=1024,
            apply_filter=True,
        )
        return detected
class RecolorPreprocessor(Preprocessor):
    """Model-free preprocessor that strips colour from the input image."""

    def to(self, device: torch.device | str) -> Preprocessor:
        """Return ``self`` unchanged; there is no model to move."""
        return self

    def __call__(self, image: PIL.Image.Image) -> PIL.Image.Image:
        """Convert ``image`` to mode ``"L"`` and then back to mode ``"RGB"``."""
        grayscale = image.convert("L")
        return grayscale.convert("RGB")
class ZoePreprocessor(Preprocessor):
    """Preprocessor that delegates to ``controlnet_aux.ZoeDetector``."""

    def __init__(self) -> None:
        """Load the ``zoedepth_nk`` checkpoint from ``valhalla/t2iadapter-aux-models``."""
        self.model = ZoeDetector.from_pretrained(
            "valhalla/t2iadapter-aux-models",
            filename="zoed_nk.pth",
            model_type="zoedepth_nk",
        )

    def to(self, device: torch.device | str) -> Preprocessor:
        """Move the detector to ``device`` and return ``self`` for chaining."""
        self.model.to(device)
        return self

    def __call__(self, image: PIL.Image.Image) -> PIL.Image.Image:
        """Run the detector with ``gamma_corrected=True`` and ``image_resolution=1024``."""
        detected = self.model(image, gamma_corrected=True, image_resolution=1024)
        return detected
# Preloading strategy, chosen through environment variables; only the exact
# value "1" enables a flag. The GPU flag takes precedence when both are set.
PRELOAD_PREPROCESSORS_IN_GPU_MEMORY = os.environ.get("PRELOAD_PREPROCESSORS_IN_GPU_MEMORY", "0") == "1"
PRELOAD_PREPROCESSORS_IN_CPU_MEMORY = os.environ.get("PRELOAD_PREPROCESSORS_IN_CPU_MEMORY", "0") == "1"

# Preprocessor class registered for each adapter name. The declaration order
# is also the order in which instances are created when preloading.
_PREPROCESSOR_CLASSES: dict[str, type[Preprocessor]] = {
    "canny": CannyPreprocessor,
    "sketch": PidiNetPreprocessor,
    "lineart": LineartPreprocessor,
    "depth-midas": MidasPreprocessor,
    "depth-zoe": ZoePreprocessor,
    "openpose": OpenposePreprocessor,
    "recolor": RecolorPreprocessor,
}

if PRELOAD_PREPROCESSORS_IN_GPU_MEMORY:
    # Build every preprocessor once at import time and keep it on the
    # accelerator (falling back to the CPU when CUDA is unavailable).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    preprocessors_gpu: dict[str, Preprocessor] = {
        name: preprocessor_cls().to(device) for name, preprocessor_cls in _PREPROCESSOR_CLASSES.items()
    }

    def get_preprocessor(adapter_name: str) -> Preprocessor:
        """Return the preloaded preprocessor registered under ``adapter_name``.

        Raises:
            KeyError: If ``adapter_name`` is not a registered name.
        """
        return preprocessors_gpu[adapter_name]

elif PRELOAD_PREPROCESSORS_IN_CPU_MEMORY:
    # Build every preprocessor once at import time without moving it to a device.
    preprocessors_cpu: dict[str, Preprocessor] = {
        name: preprocessor_cls() for name, preprocessor_cls in _PREPROCESSOR_CLASSES.items()
    }

    def get_preprocessor(adapter_name: str) -> Preprocessor:
        """Return the preloaded preprocessor registered under ``adapter_name``.

        Raises:
            KeyError: If ``adapter_name`` is not a registered name.
        """
        return preprocessors_cpu[adapter_name]

else:

    def get_preprocessor(adapter_name: str) -> Preprocessor:
        """Create a fresh preprocessor for ``adapter_name`` on every call.

        Raises:
            ValueError: If ``adapter_name`` is not a registered name.
        """
        preprocessor_cls = _PREPROCESSOR_CLASSES.get(adapter_name)
        if preprocessor_cls is None:
            raise ValueError(f"Adapter name must be one of {ADAPTER_NAMES}")
        return preprocessor_cls()
def download_all_preprocessors():
    """Request the preprocessor of every name in ``ADAPTER_NAMES`` and discard it.

    When preprocessors are created lazily this instantiates each one once,
    presumably so that their weights are already cached before the first
    request -- confirm. The instances are not kept.
    """
    for name in ADAPTER_NAMES:
        get_preprocessor(name)
    gc.collect()


# Executed as an import-time side effect.
# NOTE(review): the source indentation was lost, so this helper and its call
# may originally have lived inside the lazy (non-preloading) branch. Running
# it in the preload modes only performs dictionary lookups.
download_all_preprocessors()
def download_all_adapters():
    """Load the fp16 weights of every adapter in ``ADAPTER_NAMES`` and discard them.

    Each adapter is loaded once via ``T2IAdapter.from_pretrained`` and dropped
    immediately; presumably the purpose is to populate the local download
    cache ahead of time -- confirm.
    """
    for adapter_name in ADAPTER_NAMES:
        T2IAdapter.from_pretrained(
            ADAPTER_REPO_IDS[adapter_name],
            torch_dtype=torch.float16,
            # The keyword was previously misspelled "varient", so the fp16
            # weight variant was never actually requested.
            variant="fp16",
        )
        # Release the adapter that was just loaded before fetching the next one.
        gc.collect()
class Model:
    """SDXL T2I-Adapter pipeline bundled with the matching preprocessor.

    The pipeline and preprocessor are only created when CUDA is available;
    otherwise both attributes are ``None`` and :meth:`run` refuses to work.

    Attributes:
        preprocessor_name: Adapter name the current preprocessor belongs to.
        adapter_name: Adapter name whose weights are currently in the pipeline.
        device: ``cuda`` when available, otherwise ``cpu``.
        preprocessor: Active ``Preprocessor`` instance, or ``None`` without CUDA.
        pipe: ``StableDiffusionXLAdapterPipeline`` instance, or ``None`` without CUDA.
    """

    # Upper bound (inclusive) accepted by run() for num_inference_steps.
    MAX_NUM_INFERENCE_STEPS = 50

    def __init__(self, adapter_name: str):
        """Build the pipeline for ``adapter_name``.

        Args:
            adapter_name: One of ``ADAPTER_NAMES``.

        Raises:
            ValueError: If ``adapter_name`` is not in ``ADAPTER_NAMES``.
        """
        if adapter_name not in ADAPTER_NAMES:
            raise ValueError(f"Adapter name must be one of {ADAPTER_NAMES}")

        self.preprocessor_name = adapter_name
        self.adapter_name = adapter_name
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if torch.cuda.is_available():
            self.preprocessor = get_preprocessor(adapter_name).to(self.device)

            model_id = "stabilityai/stable-diffusion-xl-base-1.0"
            adapter = self._load_adapter(adapter_name)
            self.pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
                model_id,
                vae=AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16),
                adapter=adapter,
                scheduler=EulerAncestralDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler"),
                torch_dtype=torch.float16,
                variant="fp16",
            ).to(self.device)
            self.pipe.enable_xformers_memory_efficient_attention()
            # Load the offset-noise example LoRA from the base repository and
            # fuse it into the pipeline weights at a scale of 0.4.
            self.pipe.load_lora_weights(
                "stabilityai/stable-diffusion-xl-base-1.0", weight_name="sd_xl_offset_example-lora_1.0.safetensors"
            )
            self.pipe.fuse_lora(lora_scale=0.4)
        else:
            self.preprocessor = None  # type: ignore
            self.pipe = None

    def _load_adapter(self, adapter_name: str) -> T2IAdapter:
        """Load the fp16 weights of ``adapter_name`` and move them to ``self.device``."""
        return T2IAdapter.from_pretrained(
            ADAPTER_REPO_IDS[adapter_name],
            torch_dtype=torch.float16,
            # Previously misspelled "varient", so the fp16 weight variant was
            # never actually requested.
            variant="fp16",
        ).to(self.device)

    def change_preprocessor(self, adapter_name: str) -> None:
        """Switch the active preprocessor to the one for ``adapter_name``.

        Does nothing when that preprocessor is already active.

        Raises:
            ValueError: If ``adapter_name`` is not in ``ADAPTER_NAMES``.
        """
        if adapter_name not in ADAPTER_NAMES:
            raise ValueError(f"Adapter name must be one of {ADAPTER_NAMES}")
        if adapter_name == self.preprocessor_name:
            return

        if PRELOAD_PREPROCESSORS_IN_GPU_MEMORY:
            # Preloaded instances stay where they are; nothing to release.
            pass
        elif PRELOAD_PREPROCESSORS_IN_CPU_MEMORY:
            # Hand the outgoing preloaded instance back to the CPU.
            self.preprocessor.to("cpu")
        else:
            # Drop the reference to the outgoing instance before the next one
            # is created. Assigning None (rather than deleting the attribute)
            # keeps the attribute present if creating the replacement fails.
            self.preprocessor = None  # type: ignore
        self.preprocessor = get_preprocessor(adapter_name).to(self.device)
        self.preprocessor_name = adapter_name
        gc.collect()
        torch.cuda.empty_cache()

    def change_adapter(self, adapter_name: str) -> None:
        """Replace the pipeline's adapter with the weights for ``adapter_name``.

        Does nothing when that adapter is already loaded.

        Raises:
            ValueError: If ``adapter_name`` is not in ``ADAPTER_NAMES``.
        """
        if adapter_name not in ADAPTER_NAMES:
            raise ValueError(f"Adapter name must be one of {ADAPTER_NAMES}")
        if adapter_name == self.adapter_name:
            return

        self.pipe.adapter = self._load_adapter(adapter_name)
        self.adapter_name = adapter_name
        gc.collect()
        torch.cuda.empty_cache()

    def resize_image(self, image: PIL.Image.Image) -> PIL.Image.Image:
        """Scale ``image`` so that its longer side becomes 1024 pixels.

        The scale factor is applied to both sides (smaller images are
        enlarged) and the result is resampled with LANCZOS.

        Args:
            image: Image to resize; it is not modified.

        Returns:
            A new image whose longer side is 1024 pixels.
        """
        w, h = image.size
        scale = 1024 / max(w, h)
        # int() truncates, so a very elongated input could otherwise yield a
        # 0-pixel side, which cannot be resized to.
        new_w = max(1, int(w * scale))
        new_h = max(1, int(h * scale))
        return image.resize((new_w, new_h), PIL.Image.LANCZOS)

    def run(
        self,
        image: PIL.Image.Image,
        prompt: str,
        negative_prompt: str,
        adapter_name: str,
        num_inference_steps: int = 30,
        guidance_scale: float = 5.0,
        adapter_conditioning_scale: float = 1.0,
        adapter_conditioning_factor: float = 1.0,
        seed: int = 0,
        apply_preprocess: bool = True,
    ) -> list[PIL.Image.Image]:
        """Generate an image conditioned on ``image`` with the given adapter.

        Args:
            image: Source picture, or an already prepared conditioning image
                when ``apply_preprocess`` is false.
            prompt: Text prompt forwarded to the pipeline.
            negative_prompt: Negative prompt forwarded to the pipeline.
            adapter_name: One of ``ADAPTER_NAMES``; the preprocessor and the
                adapter are switched to it when necessary.
            num_inference_steps: Number of steps; at most
                ``MAX_NUM_INFERENCE_STEPS``.
            guidance_scale: Forwarded to the pipeline unchanged.
            adapter_conditioning_scale: Forwarded to the pipeline unchanged.
            adapter_conditioning_factor: Forwarded to the pipeline unchanged.
            seed: Seed for the ``torch.Generator`` passed to the pipeline.
            apply_preprocess: Whether to run the preprocessor on ``image``.

        Returns:
            A two-element list: the conditioning image actually given to the
            pipeline, followed by the first generated image.

        Raises:
            RuntimeError: If CUDA is not available.
            ValueError: If ``num_inference_steps`` exceeds the maximum or
                ``adapter_name`` is unknown.
        """
        if not torch.cuda.is_available():
            raise RuntimeError("This demo does not work on CPU.")
        if num_inference_steps > self.MAX_NUM_INFERENCE_STEPS:
            # The limit itself is accepted, so the message says "at most".
            raise ValueError(f"Number of steps must be at most {self.MAX_NUM_INFERENCE_STEPS}")

        # Resize image to avoid OOM
        image = self.resize_image(image)

        self.change_preprocessor(adapter_name)
        self.change_adapter(adapter_name)

        if apply_preprocess:
            image = self.preprocessor(image)

        # Snap the conditioning image to one of the SD_XL_BASE_RATIOS sizes.
        image = resize_to_closest_aspect_ratio(image)

        generator = torch.Generator(device=self.device).manual_seed(seed)
        out = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            image=image,
            num_inference_steps=num_inference_steps,
            adapter_conditioning_scale=adapter_conditioning_scale,
            adapter_conditioning_factor=adapter_conditioning_factor,
            generator=generator,
            guidance_scale=guidance_scale,
        ).images[0]
        return [image, out]