import cv2
import torch
import numpy as np
from PIL import Image

from diffusers.utils import load_image
from diffusers.models import ControlNetModel

from insightface.app import FaceAnalysis

from pipeline_stable_diffusion_xl_instantid_img2img import StableDiffusionXLInstantIDImg2ImgPipeline, draw_kps

def resize_img(input_image, max_side=1280, min_side=1024, size=None,
               pad_to_max_side=False, mode=Image.BILINEAR, base_pixel_number=64):
    # Resize so the short side reaches min_side (capped by max_side on the long side),
    # then snap both dimensions to multiples of base_pixel_number for SDXL.
    w, h = input_image.size
    if size is not None:
        w_resize_new, h_resize_new = size
    else:
        ratio = min_side / min(h, w)
        w, h = round(ratio * w), round(ratio * h)
        ratio = max_side / max(h, w)
        input_image = input_image.resize([round(ratio * w), round(ratio * h)], mode)
        w_resize_new = (round(ratio * w) // base_pixel_number) * base_pixel_number
        h_resize_new = (round(ratio * h) // base_pixel_number) * base_pixel_number
    input_image = input_image.resize([w_resize_new, h_resize_new], mode)

    if pad_to_max_side:
        # Center the resized image on a white max_side x max_side canvas.
        res = np.ones([max_side, max_side, 3], dtype=np.uint8) * 255
        offset_x = (max_side - w_resize_new) // 2
        offset_y = (max_side - h_resize_new) // 2
        res[offset_y:offset_y + h_resize_new, offset_x:offset_x + w_resize_new] = np.array(input_image)
        input_image = Image.fromarray(res)
    return input_image


if __name__ == "__main__":
    # Load face encoder
    app = FaceAnalysis(name='antelopev2', root='./', providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
    app.prepare(ctx_id=0, det_size=(640, 640))

    # Path to InstantID models
    face_adapter = './checkpoints/ip-adapter.bin'
    controlnet_path = './checkpoints/ControlNetModel'

    # Load pipeline
    controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)

    base_model_path = 'stabilityai/stable-diffusion-xl-base-1.0'
    pipe = StableDiffusionXLInstantIDImg2ImgPipeline.from_pretrained(
        base_model_path,
        controlnet=controlnet,
        torch_dtype=torch.float16,
    )
    pipe.cuda()
    pipe.load_ip_adapter_instantid(face_adapter)

    # Inference settings
    prompt = "analog film photo of a man. faded film, desaturated, 35mm photo, grainy, vignette, vintage, Kodachrome, Lomography, stained, highly detailed, found footage, masterpiece, best quality"
    n_prompt = "(lowres, low quality, worst quality:1.2), (text:1.2), watermark, painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured"

    face_image = load_image("./examples/yann-lecun_resize.jpg")
    face_image = resize_img(face_image)

    # Extract the identity embedding and facial keypoints from the detected face
    face_info = app.get(cv2.cvtColor(np.array(face_image), cv2.COLOR_RGB2BGR))
    face_info = sorted(face_info, key=lambda x: (x['bbox'][2] - x['bbox'][0]) * (x['bbox'][3] - x['bbox'][1]))[-1]  # only use the maximum face
    face_emb = face_info['embedding']
    face_kps = draw_kps(face_image, face_info['kps'])

    image = pipe(
        prompt=prompt,
        negative_prompt=n_prompt,
        image=face_image,
        image_embeds=face_emb,
        control_image=face_kps,
        controlnet_conditioning_scale=0.8,
        ip_adapter_scale=0.8,
        num_inference_steps=30,
        guidance_scale=5,
        strength=0.85,
    ).images[0]
    image.save('result.jpg')
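
    # A minimal, optional sketch (not part of the original script): reusing the same
    # `pipe`, `app`, and prompts over several reference photos. The `input_paths`
    # list and the `result_{idx}.jpg` naming are illustrative assumptions.
    #
    # input_paths = ['./examples/face_0.jpg', './examples/face_1.jpg']  # hypothetical paths
    # for idx, path in enumerate(input_paths):
    #     img = resize_img(load_image(path))
    #     faces = app.get(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR))
    #     if not faces:
    #         continue  # skip images where no face is detected
    #     face = sorted(faces, key=lambda x: (x['bbox'][2] - x['bbox'][0]) * (x['bbox'][3] - x['bbox'][1]))[-1]
    #     out = pipe(
    #         prompt=prompt,
    #         negative_prompt=n_prompt,
    #         image=img,
    #         image_embeds=face['embedding'],
    #         control_image=draw_kps(img, face['kps']),
    #         controlnet_conditioning_scale=0.8,
    #         ip_adapter_scale=0.8,
    #         num_inference_steps=30,
    #         guidance_scale=5,
    #         strength=0.85,
    #     ).images[0]
    #     out.save(f'result_{idx}.jpg')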