Spaces:
Paused
Paused
File size: 2,931 Bytes
5bb6a56 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
import torch
from transformers import CLIPVisionModelWithProjection,CLIPImageProcessor
from diffusers.utils import load_image
import os,sys
from kolors.pipelines.pipeline_stable_diffusion_xl_chatglm_256_ipadapter import StableDiffusionXLPipeline
from kolors.models.modeling_chatglm import ChatGLMModel
from kolors.models.tokenization_chatglm import ChatGLMTokenizer
# from diffusers import UNet2DConditionModel, AutoencoderKL
from diffusers import AutoencoderKL
from kolors.models.unet_2d_condition import UNet2DConditionModel
from diffusers import EulerDiscreteScheduler
from PIL import Image
root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
def infer( ip_img_path, prompt ):
ckpt_dir = f'{root_dir}/weights/Kolors'
text_encoder = ChatGLMModel.from_pretrained(
f'{ckpt_dir}/text_encoder',
torch_dtype=torch.float16).half()
tokenizer = ChatGLMTokenizer.from_pretrained(f'{ckpt_dir}/text_encoder')
vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None).half()
scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half()
image_encoder = CLIPVisionModelWithProjection.from_pretrained( f'{root_dir}/weights/Kolors-IP-Adapter-Plus/image_encoder', ignore_mismatched_sizes=True).to(dtype=torch.float16)
ip_img_size = 336
clip_image_processor = CLIPImageProcessor( size=ip_img_size, crop_size=ip_img_size )
pipe = StableDiffusionXLPipeline(
vae=vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
unet=unet,
scheduler=scheduler,
image_encoder=image_encoder,
feature_extractor=clip_image_processor,
force_zeros_for_empty_prompt=False
)
pipe = pipe.to("cuda")
pipe.enable_model_cpu_offload()
if hasattr(pipe.unet, 'encoder_hid_proj'):
pipe.unet.text_encoder_hid_proj = pipe.unet.encoder_hid_proj
pipe.load_ip_adapter( f'{root_dir}/weights/Kolors-IP-Adapter-Plus' , subfolder="", weight_name=["ip_adapter_plus_general.bin"])
basename = ip_img_path.rsplit('/',1)[-1].rsplit('.',1)[0]
ip_adapter_img = Image.open( ip_img_path )
generator = torch.Generator(device="cpu").manual_seed(66)
for scale in [0.5]:
pipe.set_ip_adapter_scale([ scale ])
# print(prompt)
image = pipe(
prompt= prompt ,
ip_adapter_image=[ ip_adapter_img ],
negative_prompt="",
height=1024,
width=1024,
num_inference_steps= 50,
guidance_scale=5.0,
num_images_per_prompt=1,
generator=generator,
).images[0]
image.save(f'{root_dir}/scripts/outputs/sample_ip_{basename}.jpg')
if __name__ == '__main__':
import fire
fire.Fire(infer)
|