Spaces:

JackAILab
/

ConsistentID

Running on Zero

App Files Files Community

JackAILab committed on May 8

Commit

9aa2c12

•

1 Parent(s): 6e291eb

Update pipline_StableDiffusion_ConsistentID.py

Browse files

Files changed (1) hide show

pipline_StableDiffusion_ConsistentID.py +61 -58

pipline_StableDiffusion_ConsistentID.py CHANGED Viewed

@@ -5,7 +5,8 @@ import numpy as np
 from PIL import Image
 import torch
 from torchvision import transforms
-from insightface.app import FaceAnalysis
 from safetensors import safe_open
 from huggingface_hub.utils import validate_hf_hub_args
 from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
@@ -15,15 +16,11 @@ from diffusers.utils import _get_model_file
 from functions import process_text_with_markers, masks_for_unique_values, fetch_mask_raw_image, tokenize_and_mask_noun_phrases_ends, prepare_image_token_idx
 from functions import ProjPlusModel, masks_for_unique_values
 from attention import Consistent_IPAttProcessor, Consistent_AttProcessor, FacialEncoder
-# from modelscope.outputs import OutputKeys
-# from modelscope.pipelines import pipeline
-#TODO
-import sys
-sys.path.append("./models/BiSeNet")
-from model import BiSeNet
 PipelineImageInput = Union[
     PIL.Image.Image,
@@ -32,7 +29,7 @@ PipelineImageInput = Union[
     List[torch.FloatTensor],
 ]
 class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
     @validate_hf_hub_args
@@ -43,13 +40,13 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         subfolder: str = '',
         trigger_word_ID: str = '<|image|>',
         trigger_word_facial: str = '<|facial|>',
-        image_encoder_path: str = 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K',   # TODO
         torch_dtype = torch.float16,
         num_tokens = 4,
         lora_rank= 128,
         **kwargs,
     ):
-        self.lora_rank = lora_rank
         self.torch_dtype = torch_dtype
         self.num_tokens = num_tokens
         self.set_ip_adapter()
@@ -68,7 +65,7 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         ### BiSeNet
         self.bise_net = BiSeNet(n_classes = 19)
         self.bise_net.cuda()
-        self.bise_net_cp='./models/BiSeNet_pretrained_for_ConsistentID.pth' #TODO
         self.bise_net.load_state_dict(torch.load(self.bise_net_cp))
         self.bise_net.eval()
         # Colors for all 20 parts
@@ -82,8 +79,9 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
                     [255, 0, 255], [255, 85, 255], [255, 170, 255],
                     [0, 255, 255], [85, 255, 255], [170, 255, 255]]
-        ### LLVA Optional
-        self.llva_model_path = "llava-hf/llava-1.5-7b-hf" #TODO
         self.llva_prompt = "Describe this person's facial features for me, including face, ears, eyes, nose, and mouth."
         self.llva_tokenizer, self.llva_model, self.llva_image_processor, self.llva_context_len = None,None,None,None #load_pretrained_model(self.llva_model_path)
@@ -91,12 +89,10 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
             cross_attention_dim=self.unet.config.cross_attention_dim,
             id_embeddings_dim=512,
             clip_embeddings_dim=self.image_encoder.config.hidden_size,
-            num_tokens=self.num_tokens,  # 4
         ).to(self.device, dtype=self.torch_dtype)
         self.FacialEncoder = FacialEncoder(self.image_encoder).to(self.device, dtype=self.torch_dtype)
-        # self.skin_retouching = pipeline('skin-retouching-torch', model='damo/cv_unet_skin_retouching_torch', model_revision='v1.0.2')
         # Load the main state dict first.
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
@@ -189,8 +185,10 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         multi_facial_embeds = torch.stack(hidden_states)
         uncond_multi_facial_embeds = torch.stack(uncond_hidden_states)
         facial_prompt_embeds = self.FacialEncoder(prompt_embeds, multi_facial_embeds, facial_token_masks, valid_facial_token_idx_mask)
         uncond_facial_prompt_embeds = self.FacialEncoder(negative_prompt_embeds, uncond_multi_facial_embeds, facial_token_masks, valid_facial_token_idx_mask)
         return facial_prompt_embeds, uncond_facial_prompt_embeds
@@ -202,9 +200,11 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         clip_image = clip_image.to(self.device, dtype=self.torch_dtype)
         clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
         uncond_clip_image_embeds = self.image_encoder(torch.zeros_like(clip_image), output_hidden_states=True).hidden_states[-2]
         faceid_embeds = faceid_embeds.to(self.device, dtype=self.torch_dtype)
         image_prompt_tokens = self.image_proj_model(faceid_embeds, clip_image_embeds, shortcut=shortcut, scale=s_scale)
         uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(faceid_embeds), uncond_clip_image_embeds, shortcut=shortcut, scale=s_scale)
         return image_prompt_tokens, uncond_image_prompt_embeds
     def set_scale(self, scale):
@@ -220,6 +220,7 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
             faceid_embeds = torch.zeros_like(torch.empty((1, 512)))
         else:
             faceid_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
         return faceid_embeds
     @torch.inference_mode()
@@ -237,13 +238,13 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
             img = to_tensor(image)
             img = torch.unsqueeze(img, 0)
             img = img.float().cuda()
-            out = self.bise_net(img)[0] #1,19,512,512
-            parsing_anno = out.squeeze(0).cpu().numpy().argmax(0)
         im = np.array(image_resize_PIL)
         vis_im = im.copy().astype(np.uint8)
         stride=1
-        vis_parsing_anno = parsing_anno.copy().astype(np.uint8)
         vis_parsing_anno = cv2.resize(vis_parsing_anno, None, fx=stride, fy=stride, interpolation=cv2.INTER_NEAREST)
         vis_parsing_anno_color = np.zeros((vis_parsing_anno.shape[0], vis_parsing_anno.shape[1], 3)) + 255
@@ -253,7 +254,7 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
             index = np.where(vis_parsing_anno == pi)
             vis_parsing_anno_color[index[0], index[1], :] = self.part_colors[pi]
-        vis_parsing_anno_color = vis_parsing_anno_color.astype(np.uint8)
         vis_parsing_anno_color = cv2.addWeighted(cv2.cvtColor(vis_im, cv2.COLOR_RGB2BGR), 0.4, vis_parsing_anno_color, 0.6, 0)
         return vis_parsing_anno_color, vis_parsing_anno
@@ -282,23 +283,20 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         return face_caption
     @torch.inference_mode()
     def get_prepare_facemask(self, input_image_file):
         vis_parsing_anno_color, vis_parsing_anno = self.parsing_face_mask(input_image_file)
         parsing_mask_list = masks_for_unique_values(vis_parsing_anno)
         key_parsing_mask_list = {}
         key_list = ["Face", "Left_Ear", "Right_Ear", "Left_Eye", "Right_Eye", "Nose", "Upper_Lip", "Lower_Lip"]
         processed_keys = set()
         for key, mask_image in parsing_mask_list.items():
             if key in key_list:
                 if "_" in key:
                     prefix = key.split("_")[1]
-                    if prefix in processed_keys:
                         continue
                     else:
                         key_parsing_mask_list[key] = mask_image
@@ -320,6 +318,7 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         device: Optional[torch.device] = None,
     ):
         device = device or self._execution_device
         face_caption_align, key_parsing_mask_list_align = process_text_with_markers(face_caption, key_parsing_mask_list)
         prompt_face = prompt + "Detail:" + face_caption_align
@@ -335,9 +334,11 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         prompt_text_only = prompt_face.replace("<|facial|>", "").replace("<|image|>", "")
         tokenizer = self.tokenizer
         facial_token_id = tokenizer.convert_tokens_to_ids(facial_token)
-        image_token_id = None
         clean_input_id, image_token_mask, facial_token_mask = tokenize_and_mask_noun_phrases_ends(
         prompt_face, image_token_id, facial_token_id, tokenizer)
         image_token_idx, image_token_idx_mask, facial_token_idx, facial_token_idx_mask = prepare_image_token_idx(
             image_token_mask, facial_token_mask, num_id_images, max_num_facials )
@@ -352,6 +353,7 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         clip_image_processor = CLIPImageProcessor()
         num_facial_part = len(key_parsing_mask_list)
         for key in key_parsing_mask_list:
             key_mask=key_parsing_mask_list[key]
             facial_mask.append(transform_mask(key_mask))
@@ -361,6 +363,7 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         padding_ficial_clip_image = torch.zeros_like(torch.zeros([1, 3, 224, 224]))
         padding_ficial_mask = torch.zeros_like(torch.zeros([1, image_size, image_size]))
         if num_facial_part < max_num_facials:
             facial_clip_image += [torch.zeros_like(padding_ficial_clip_image) for _ in range(max_num_facials - num_facial_part) ]
             facial_mask += [ torch.zeros_like(padding_ficial_mask) for _ in range(max_num_facials - num_facial_part)]
@@ -368,7 +371,7 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         facial_clip_image = torch.stack(facial_clip_image, dim=1).squeeze(0)
         facial_mask = torch.stack(facial_mask, dim=0).squeeze(dim=1)
-        return facial_clip_image, facial_mask
     @torch.no_grad()
     def __call__(
@@ -393,12 +396,9 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: int = 1,
         input_id_images: PipelineImageInput = None,
-        reference_id_images: PipelineImageInput =None,
         start_merge_step: int = 0,
         class_tokens_mask: Optional[torch.LongTensor] = None,
         prompt_embeds_text_only: Optional[torch.FloatTensor] = None,
-        retouching: bool=False,
-        need_safetycheck: bool=True,
     ):
         # 0. Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
@@ -424,7 +424,7 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         if prompt is not None and isinstance(prompt, str):
             batch_size = 1
         elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt) #TODO
         else:
             batch_size = prompt_embeds.shape[0]
@@ -432,8 +432,7 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         do_classifier_free_guidance = guidance_scale >= 1.0
         input_image_file = input_id_images[0]
-        faceid_embeds = self.get_prepare_faceid(face_image=input_image_file)
         face_caption = self.get_prepare_llva_caption(input_image_file)
         key_parsing_mask_list, vis_parsing_anno_color = self.get_prepare_facemask(input_image_file)
@@ -445,13 +444,14 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         (
             prompt_text_only,
             clean_input_id,
-            key_parsing_mask_list_align,
-            facial_token_mask,
-            facial_token_idx,
             facial_token_idx_mask,
         ) = self.encode_prompt_with_trigger_word(
             prompt = prompt,
             face_caption = face_caption,
             key_parsing_mask_list=key_parsing_mask_list,
             device=device,
             max_num_facials = 5,
@@ -463,18 +463,20 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         # 4. Encode input prompt without the trigger word for delayed conditioning
         encoder_hidden_states = self.text_encoder(clean_input_id.to(device))[0]
         prompt_embeds = self._encode_prompt(
             prompt_text_only,
             device=device,
             num_images_per_prompt=num_images_per_prompt,
             do_classifier_free_guidance=True,
             negative_prompt=negative_prompt,
-        )
         negative_encoder_hidden_states_text_only = prompt_embeds[0:num_images_per_prompt]
         encoder_hidden_states_text_only = prompt_embeds[num_images_per_prompt:]
         # 5. Prepare the input ID images
-        prompt_tokens_faceid, uncond_prompt_tokens_faceid = self.get_image_embeds(faceid_embeds, face_image=input_image_file, s_scale=0.0, shortcut=True)
         facial_clip_image, facial_mask = self.get_prepare_clip_image(input_image_file, key_parsing_mask_list_align, image_size=512, max_num_facials=5)
         facial_clip_images = facial_clip_image.unsqueeze(0).to(device, dtype=self.torch_dtype)
         facial_token_mask = facial_token_mask.to(device)
@@ -483,11 +485,13 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         cross_attention_kwargs = {}
-        # 6. Get the update text embeddingx
         prompt_embeds_facial, uncond_prompt_embeds_facial = self.get_facial_embeds(encoder_hidden_states, negative_encoder_hidden_states, \
                                                             facial_clip_images, facial_token_mask, facial_token_idx_mask)
         prompt_embeds = torch.cat([prompt_embeds_facial, prompt_tokens_faceid], dim=1)
         negative_prompt_embeds = torch.cat([uncond_prompt_embeds_facial, uncond_prompt_tokens_faceid], dim=1)
         prompt_embeds = self._encode_prompt(
             prompt,
             device,
@@ -519,8 +523,8 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
         (
-            null_prompt_embeds,
-            augmented_prompt_embeds,
             text_prompt_embeds,
         ) = prompt_embeds.chunk(3)
@@ -542,6 +546,7 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
                         [null_prompt_embeds, augmented_prompt_embeds], dim=0
                     )
                 noise_pred = self.unet(
                     latent_model_input,
                     t,
@@ -574,25 +579,17 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
         if output_type == "latent":
             image = latents
             has_nsfw_concept = None
-        elif output_type == "pil":
             # 9.1 Post-processing
             image = self.decode_latents(latents)
             # 9.2 Run safety checker
-            if need_safetycheck:
-                image, has_nsfw_concept = self.run_safety_checker(
-                    image, device, prompt_embeds.dtype
-                )
-            else:
-                has_nsfw_concept = None
-            # 9.3 Convert to PIL list
-            image = self.numpy_to_pil(image)
-            # if retouching:
-            #     after_retouching = self.skin_retouching(image[0])
-            #     if OutputKeys.OUTPUT_IMG in after_retouching:
-            #         image = [Image.fromarray(cv2.cvtColor(after_retouching[OutputKeys.OUTPUT_IMG], cv2.COLOR_BGR2RGB))]
         else:
             # 9.1 Post-processing
             image = self.decode_latents(latents)
@@ -602,7 +599,6 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
                 image, device, prompt_embeds.dtype
             )
         # Offload last model to CPU
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.final_offload_hook.offload()
@@ -614,3 +610,10 @@ class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
             images=image, nsfw_content_detected=has_nsfw_concept
         )

 from PIL import Image
 import torch
 from torchvision import transforms
+from insightface.app import FaceAnalysis
+### InsightFace installation instructions can be found at https://github.com/deepinsight/insightface
 from safetensors import safe_open
 from huggingface_hub.utils import validate_hf_hub_args
 from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
 from functions import process_text_with_markers, masks_for_unique_values, fetch_mask_raw_image, tokenize_and_mask_noun_phrases_ends, prepare_image_token_idx
 from functions import ProjPlusModel, masks_for_unique_values
 from attention import Consistent_IPAttProcessor, Consistent_AttProcessor, FacialEncoder
+### The BiSeNet model can be imported from https://github.com/zllrunning/face-parsing.PyTorch?tab=readme-ov-file
+### We use the 79999_iter.pth checkpoint: https://drive.google.com/open?id=154JgKpzCPW82qINcVieuPH3fZ2e0P812
+### Thanks to the authors for open-sourcing the face-parsing model.
+from models.BiSeNet.model import BiSeNet
 PipelineImageInput = Union[
     PIL.Image.Image,
     List[torch.FloatTensor],
 ]
+### Download the pretrained model from Hugging Face, place it in a local directory, and specify that directory's path.
 class ConsistentIDStableDiffusionPipeline(StableDiffusionPipeline):
     @validate_hf_hub_args
         subfolder: str = '',
         trigger_word_ID: str = '<|image|>',
         trigger_word_facial: str = '<|facial|>',
+        image_encoder_path: str = 'laion/CLIP-ViT-H-14-laion2B-s32B-b79K',
         torch_dtype = torch.float16,
         num_tokens = 4,
         lora_rank= 128,
         **kwargs,
     ):
+        self.lora_rank = lora_rank
         self.torch_dtype = torch_dtype
         self.num_tokens = num_tokens
         self.set_ip_adapter()
         ### BiSeNet
         self.bise_net = BiSeNet(n_classes = 19)
         self.bise_net.cuda()
+        self.bise_net_cp='JackAILab/ConsistentID/face_parsing.pth'
         self.bise_net.load_state_dict(torch.load(self.bise_net_cp))
         self.bise_net.eval()
         # Colors for all 20 parts
                     [255, 0, 255], [255, 85, 255], [255, 170, 255],
                     [0, 255, 255], [85, 255, 255], [170, 255, 255]]
+        ### LLVA (Optional)
+        self.llva_model_path = "liuhaotian/llava-v1.5-13b" # TODO
+        # IMPORTANT! Download the openai/clip-vit-large-patch14-336 model and specify the model path in config.json ("mm_vision_tower": "openai/clip-vit-large-patch14-336").
         self.llva_prompt = "Describe this person's facial features for me, including face, ears, eyes, nose, and mouth."
         self.llva_tokenizer, self.llva_model, self.llva_image_processor, self.llva_context_len = None,None,None,None #load_pretrained_model(self.llva_model_path)
             cross_attention_dim=self.unet.config.cross_attention_dim,
             id_embeddings_dim=512,
             clip_embeddings_dim=self.image_encoder.config.hidden_size,
+            num_tokens=self.num_tokens,  # 4 - inspirsed by IPAdapter and Midjourney
         ).to(self.device, dtype=self.torch_dtype)
         self.FacialEncoder = FacialEncoder(self.image_encoder).to(self.device, dtype=self.torch_dtype)
         # Load the main state dict first.
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
         multi_facial_embeds = torch.stack(hidden_states)
         uncond_multi_facial_embeds = torch.stack(uncond_hidden_states)
+        # conditional
         facial_prompt_embeds = self.FacialEncoder(prompt_embeds, multi_facial_embeds, facial_token_masks, valid_facial_token_idx_mask)
+        # unconditional
         uncond_facial_prompt_embeds = self.FacialEncoder(negative_prompt_embeds, uncond_multi_facial_embeds, facial_token_masks, valid_facial_token_idx_mask)
         return facial_prompt_embeds, uncond_facial_prompt_embeds
         clip_image = clip_image.to(self.device, dtype=self.torch_dtype)
         clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
         uncond_clip_image_embeds = self.image_encoder(torch.zeros_like(clip_image), output_hidden_states=True).hidden_states[-2]
         faceid_embeds = faceid_embeds.to(self.device, dtype=self.torch_dtype)
         image_prompt_tokens = self.image_proj_model(faceid_embeds, clip_image_embeds, shortcut=shortcut, scale=s_scale)
         uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(faceid_embeds), uncond_clip_image_embeds, shortcut=shortcut, scale=s_scale)
         return image_prompt_tokens, uncond_image_prompt_embeds
     def set_scale(self, scale):
             faceid_embeds = torch.zeros_like(torch.empty((1, 512)))
         else:
             faceid_embeds = torch.from_numpy(faces[0].normed_embedding).unsqueeze(0)
         return faceid_embeds
     @torch.inference_mode()
             img = to_tensor(image)
             img = torch.unsqueeze(img, 0)
             img = img.float().cuda()
+            out = self.bise_net(img)[0]
+            parsing_anno = out.squeeze(0).cpu().numpy().argmax(0)
         im = np.array(image_resize_PIL)
         vis_im = im.copy().astype(np.uint8)
         stride=1
+        vis_parsing_anno = parsing_anno.copy().astype(np.uint8)
         vis_parsing_anno = cv2.resize(vis_parsing_anno, None, fx=stride, fy=stride, interpolation=cv2.INTER_NEAREST)
         vis_parsing_anno_color = np.zeros((vis_parsing_anno.shape[0], vis_parsing_anno.shape[1], 3)) + 255
             index = np.where(vis_parsing_anno == pi)
             vis_parsing_anno_color[index[0], index[1], :] = self.part_colors[pi]
+        vis_parsing_anno_color = vis_parsing_anno_color.astype(np.uint8)
         vis_parsing_anno_color = cv2.addWeighted(cv2.cvtColor(vis_im, cv2.COLOR_RGB2BGR), 0.4, vis_parsing_anno_color, 0.6, 0)
         return vis_parsing_anno_color, vis_parsing_anno
         return face_caption
     @torch.inference_mode()
     def get_prepare_facemask(self, input_image_file):
         vis_parsing_anno_color, vis_parsing_anno = self.parsing_face_mask(input_image_file)
         parsing_mask_list = masks_for_unique_values(vis_parsing_anno)
         key_parsing_mask_list = {}
         key_list = ["Face", "Left_Ear", "Right_Ear", "Left_Eye", "Right_Eye", "Nose", "Upper_Lip", "Lower_Lip"]
         processed_keys = set()
         for key, mask_image in parsing_mask_list.items():
             if key in key_list:
                 if "_" in key:
                     prefix = key.split("_")[1]
+                    if prefix in processed_keys:
                         continue
                     else:
                         key_parsing_mask_list[key] = mask_image
         device: Optional[torch.device] = None,
     ):
         device = device or self._execution_device
         face_caption_align, key_parsing_mask_list_align = process_text_with_markers(face_caption, key_parsing_mask_list)
         prompt_face = prompt + "Detail:" + face_caption_align
         prompt_text_only = prompt_face.replace("<|facial|>", "").replace("<|image|>", "")
         tokenizer = self.tokenizer
         facial_token_id = tokenizer.convert_tokens_to_ids(facial_token)
+        image_token_id = None
         clean_input_id, image_token_mask, facial_token_mask = tokenize_and_mask_noun_phrases_ends(
         prompt_face, image_token_id, facial_token_id, tokenizer)
         image_token_idx, image_token_idx_mask, facial_token_idx, facial_token_idx_mask = prepare_image_token_idx(
             image_token_mask, facial_token_mask, num_id_images, max_num_facials )
         clip_image_processor = CLIPImageProcessor()
         num_facial_part = len(key_parsing_mask_list)
         for key in key_parsing_mask_list:
             key_mask=key_parsing_mask_list[key]
             facial_mask.append(transform_mask(key_mask))
         padding_ficial_clip_image = torch.zeros_like(torch.zeros([1, 3, 224, 224]))
         padding_ficial_mask = torch.zeros_like(torch.zeros([1, image_size, image_size]))
         if num_facial_part < max_num_facials:
             facial_clip_image += [torch.zeros_like(padding_ficial_clip_image) for _ in range(max_num_facials - num_facial_part) ]
             facial_mask += [ torch.zeros_like(padding_ficial_mask) for _ in range(max_num_facials - num_facial_part)]
         facial_clip_image = torch.stack(facial_clip_image, dim=1).squeeze(0)
         facial_mask = torch.stack(facial_mask, dim=0).squeeze(dim=1)
+        return facial_clip_image, facial_mask
     @torch.no_grad()
     def __call__(
         callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
         callback_steps: int = 1,
         input_id_images: PipelineImageInput = None,
         start_merge_step: int = 0,
         class_tokens_mask: Optional[torch.LongTensor] = None,
         prompt_embeds_text_only: Optional[torch.FloatTensor] = None,
     ):
         # 0. Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
         if prompt is not None and isinstance(prompt, str):
             batch_size = 1
         elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
         else:
             batch_size = prompt_embeds.shape[0]
         do_classifier_free_guidance = guidance_scale >= 1.0
         input_image_file = input_id_images[0]
+        faceid_embeds = self.get_prepare_faceid(face_image=input_image_file)
         face_caption = self.get_prepare_llva_caption(input_image_file)
         key_parsing_mask_list, vis_parsing_anno_color = self.get_prepare_facemask(input_image_file)
         (
             prompt_text_only,
             clean_input_id,
+            key_parsing_mask_list_align,
+            facial_token_mask,
+            facial_token_idx,
             facial_token_idx_mask,
         ) = self.encode_prompt_with_trigger_word(
             prompt = prompt,
             face_caption = face_caption,
+            # prompt_2=None,
             key_parsing_mask_list=key_parsing_mask_list,
             device=device,
             max_num_facials = 5,
         # 4. Encode input prompt without the trigger word for delayed conditioning
         encoder_hidden_states = self.text_encoder(clean_input_id.to(device))[0]
         prompt_embeds = self._encode_prompt(
             prompt_text_only,
             device=device,
             num_images_per_prompt=num_images_per_prompt,
             do_classifier_free_guidance=True,
             negative_prompt=negative_prompt,
+        )
         negative_encoder_hidden_states_text_only = prompt_embeds[0:num_images_per_prompt]
         encoder_hidden_states_text_only = prompt_embeds[num_images_per_prompt:]
         # 5. Prepare the input ID images
+        prompt_tokens_faceid, uncond_prompt_tokens_faceid = self.get_image_embeds(faceid_embeds, face_image=input_image_file, s_scale=1.0, shortcut=False)
         facial_clip_image, facial_mask = self.get_prepare_clip_image(input_image_file, key_parsing_mask_list_align, image_size=512, max_num_facials=5)
         facial_clip_images = facial_clip_image.unsqueeze(0).to(device, dtype=self.torch_dtype)
         facial_token_mask = facial_token_mask.to(device)
         cross_attention_kwargs = {}
+        # 6. Get the updated text embedding
         prompt_embeds_facial, uncond_prompt_embeds_facial = self.get_facial_embeds(encoder_hidden_states, negative_encoder_hidden_states, \
                                                             facial_clip_images, facial_token_mask, facial_token_idx_mask)
         prompt_embeds = torch.cat([prompt_embeds_facial, prompt_tokens_faceid], dim=1)
         negative_prompt_embeds = torch.cat([uncond_prompt_embeds_facial, uncond_prompt_tokens_faceid], dim=1)
         prompt_embeds = self._encode_prompt(
             prompt,
             device,
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
         (
+            null_prompt_embeds,
+            augmented_prompt_embeds,
             text_prompt_embeds,
         ) = prompt_embeds.chunk(3)
                         [null_prompt_embeds, augmented_prompt_embeds], dim=0
                     )
+                # predict the noise residual
                 noise_pred = self.unet(
                     latent_model_input,
                     t,
         if output_type == "latent":
             image = latents
             has_nsfw_concept = None
+        elif output_type == "pil":
             # 9.1 Post-processing
             image = self.decode_latents(latents)
             # 9.2 Run safety checker
+            image, has_nsfw_concept = self.run_safety_checker(
+                image, device, prompt_embeds.dtype
+            )
+            # 9.3 Convert to PIL
+            image = self.numpy_to_pil(image)
         else:
             # 9.1 Post-processing
             image = self.decode_latents(latents)
                 image, device, prompt_embeds.dtype
             )
         # Offload last model to CPU
         if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
             self.final_offload_hook.offload()
             images=image, nsfw_content_detected=has_nsfw_concept
         )