Spaces:

Westlake-AGI-Lab
/

StyleStudio

Running on Zero

App Files Community

Leimingkun committed 23 days ago

Commit

6fe0b16

•

1 Parent(s): da92c10

stylestudio

Browse files

Files changed (3) hide show

app.py +4 -3
ip_adapter/attention_processor.py +18 -627
ip_adapter/ip_adapter.py +11 -487

app.py CHANGED Viewed

@@ -85,6 +85,7 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     return seed
 @spaces.GPU
 def create_image(
                  style_image_pil,
@@ -95,7 +96,7 @@ def create_image(
                  crossModalAdaIN,
                  use_SAttn,
                  seed,
-                 negative_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
 ):
     style_image = style_image_pil
@@ -109,7 +110,7 @@ def create_image(
     with torch.no_grad():
         images = csgo.generate(pil_style_image=style_image,
                                 prompt=prompt,
-                                negative_prompt=negative_prompt,
                                 height=1024,
                                 width=1024,
                                 guidance_scale=guidance_scale,
@@ -231,7 +232,7 @@ with block:
         inputs=[style_image_pil, target, prompt, guidance_scale, seed, end_fusion],
         fn=run_for_examples,
         outputs=[generated_image],
-        cache_examples=True,
     )
     gr.Markdown(article)

     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     return seed
 @spaces.GPU
 def create_image(
                  style_image_pil,
                  crossModalAdaIN,
                  use_SAttn,
                  seed,
+                 neg_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
 ):
     style_image = style_image_pil
     with torch.no_grad():
         images = csgo.generate(pil_style_image=style_image,
                                 prompt=prompt,
+                                negative_prompt=neg_prompt,
                                 height=1024,
                                 width=1024,
                                 guidance_scale=guidance_scale,
         inputs=[style_image_pil, target, prompt, guidance_scale, seed, end_fusion],
         fn=run_for_examples,
         outputs=[generated_image],
+        cache_examples=False,
     )
     gr.Markdown(article)

ip_adapter/attention_processor.py CHANGED Viewed

@@ -757,441 +757,6 @@ class CNAttnProcessor2_0:
         return hidden_states
-class IP_FuAd_AttnProcessor2_0(torch.nn.Module):
-    r"""
-    Attention processor for IP-Adapater for PyTorch 2.0.
-    Args:
-        hidden_size (`int`):
-            The hidden size of the attention layer.
-        cross_attention_dim (`int`):
-            The number of channels in the `encoder_hidden_states`.
-        scale (`float`, defaults to 1.0):
-            the weight scale of image prompt.
-        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
-            The context length of the image features.
-    """
-    def __init__(self, hidden_size, cross_attention_dim=None, content_scale=1.0,style_scale=1.0, num_content_tokens=4,num_style_tokens=4,
-                 skip=False,content=False, style=False, fuAttn=False, fuIPAttn=False, adainIP=False,
-                 fuScale=0, end_fusion=0, attn_name=None):
-        super().__init__()
-        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
-        self.hidden_size = hidden_size
-        self.cross_attention_dim = cross_attention_dim
-        self.content_scale = content_scale
-        self.style_scale = style_scale
-        self.num_style_tokens = num_style_tokens
-        self.skip = skip
-        self.content = content
-        self.style = style
-        self.fuAttn = fuAttn
-        self.fuIPAttn = fuIPAttn
-        self.adainIP = adainIP
-        self.fuScale = fuScale
-        self.denoise_step = 0
-        self.end_fusion = end_fusion
-        self.name = attn_name
-        if self.content or self.style:
-            self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
-            self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
-        self.to_k_ip_content =None
-        self.to_v_ip_content =None
-    # def set_content_ipa(self,content_scale=1.0):
-    #     self.to_k_ip_content = nn.Linear(self.cross_attention_dim or self.hidden_size, self.hidden_size, bias=False)
-    #     self.to_v_ip_content = nn.Linear(self.cross_attention_dim or self.hidden_size, self.hidden_size, bias=False)
-    #     self.content_scale=content_scale
-    #     self.content =True
-    def reset_denoise_step(self):
-        if self.denoise_step == 50:
-            self.denoise_step = 0
-            # if "up_blocks.0.attentions.1.transformer_blocks.0.attn2" in self.name:
-            #     print("attn2 reset successful")
-    def __call__(
-        self,
-        attn,
-        hidden_states,
-        encoder_hidden_states=None,
-        attention_mask=None,
-        temb=None,
-    ):
-        self.denoise_step += 1
-        residual = hidden_states
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-        input_ndim = hidden_states.ndim
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-        if attention_mask is not None:
-            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-            # scaled_dot_product_attention expects attention_mask shape to be
-            # (batch, heads, source_length, target_length)
-            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-        query = attn.to_q(hidden_states)
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        else:
-            # get encoder_hidden_states, ip_hidden_states
-            end_pos = encoder_hidden_states.shape[1] -self.num_style_tokens
-            encoder_hidden_states, ip_style_hidden_states = (
-                encoder_hidden_states[:, :end_pos, :],
-                encoder_hidden_states[:, end_pos:, :],
-            )
-            if attn.norm_cross:
-                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        # TODO: add support for attn.scale when we move to Torch 2.1
-        # # modified the attnMap of the Stylization Image
-        if self.fuAttn and self.denoise_step <= self.end_fusion:
-            assert query.shape[0] == 4
-            scale_factor = 1 / math.sqrt(torch.tensor(head_dim, dtype=query.dtype))
-            text_attn_probs = (torch.matmul(query, key.transpose(-2, -1)) * scale_factor).softmax(dim=-1)
-            text_attn_probs[1] = self.fuScale*text_attn_probs[1] + (1-self.fuScale)*text_attn_probs[0]
-            text_attn_probs[3] = self.fuScale*text_attn_probs[3] + (1-self.fuScale)*text_attn_probs[2]
-            hidden_states = torch.matmul(text_attn_probs, value)
-        else:
-            hidden_states = F.scaled_dot_product_attention(
-                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-            )
-        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
-        hidden_states = hidden_states.to(query.dtype)
-        raw_hidden_states = hidden_states
-        if not self.skip and self.style is True:
-            # for ip-style-adapter
-            ip_style_key = self.to_k_ip(ip_style_hidden_states)
-            ip_style_value = self.to_v_ip(ip_style_hidden_states)
-            ip_style_key = ip_style_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-            ip_style_value = ip_style_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-            # the output of sdp = (batch, num_heads, seq_len, head_dim)
-            # TODO: add support for attn.scale when we move to Torch 2.1
-            if self.fuIPAttn and self.denoise_step <= self.end_fusion:
-                assert query.shape[0] == 4
-                if "down" in self.name:
-                    print("wrong! coding")
-                    exit()
-                scale_factor = 1 / math.sqrt(torch.tensor(head_dim, dtype=query.dtype))
-                ip_attn_probs = torch.matmul(query, ip_style_key.transpose(-2, -1)) * scale_factor
-                ip_attn_probs = F.softmax(ip_attn_probs, dim=-1)
-                ip_attn_probs[1] = self.fuScale*ip_attn_probs[1] + (1-self.fuScale)*ip_attn_probs[0]
-                ip_attn_probs[3] = self.fuScale*ip_attn_probs[3] + (1-self.fuScale)*ip_attn_probs[2]
-                ip_style_hidden_states = torch.matmul(ip_attn_probs, ip_style_value)
-            else:
-                ip_style_hidden_states = F.scaled_dot_product_attention(
-                    query, ip_style_key, ip_style_value, attn_mask=None, dropout_p=0.0, is_causal=False
-                )
-            ip_style_hidden_states = ip_style_hidden_states.transpose(1, 2).reshape(batch_size, -1,
-                                                                                    attn.heads * head_dim)
-            ip_style_hidden_states = ip_style_hidden_states.to(query.dtype)
-            if not self.adainIP:
-                hidden_states = hidden_states + self.style_scale * ip_style_hidden_states
-            else:
-                # print("adain")
-                def adain(content, style):
-                    content_mean = content.mean(dim=1, keepdim=True)
-                    content_std = content.std(dim=1, keepdim=True)
-                    style_mean = style.mean(dim=1, keepdim=True)
-                    style_std = style.std(dim=1, keepdim=True)
-                    normalized_content = (content - content_mean) / content_std
-                    stylized_content = normalized_content * style_std + style_mean
-                    return stylized_content
-                hidden_states = adain(content=hidden_states, style=ip_style_hidden_states)
-        if hidden_states.shape[0] == 4:
-            hidden_states[0] = raw_hidden_states[0]
-            hidden_states[2] = raw_hidden_states[2]
-        # hidden_states = raw_hidden_states
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-        hidden_states = hidden_states / attn.rescale_output_factor
-        self.reset_denoise_step()
-        return hidden_states
-class IP_FuAd_AttnProcessor2_0_exp(torch.nn.Module):
-    r"""
-    Attention processor for IP-Adapater for PyTorch 2.0.
-    Args:
-        hidden_size (`int`):
-            The hidden size of the attention layer.
-        cross_attention_dim (`int`):
-            The number of channels in the `encoder_hidden_states`.
-        scale (`float`, defaults to 1.0):
-            the weight scale of image prompt.
-        num_tokens (`int`, defaults to 4 when do ip_adapter_plus it should be 16):
-            The context length of the image features.
-    """
-    def __init__(self, hidden_size, cross_attention_dim=None, content_scale=1.0,style_scale=1.0, num_content_tokens=4,num_style_tokens=4,
-                 skip=False,content=False, style=False, fuAttn=False, fuIPAttn=False, adainIP=False,
-                 fuScale=0, end_fusion=0, attn_name=None, save_attn_map=False):
-        super().__init__()
-        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
-        self.hidden_size = hidden_size
-        self.cross_attention_dim = cross_attention_dim
-        self.content_scale = content_scale
-        self.style_scale = style_scale
-        self.num_style_tokens = num_style_tokens
-        self.skip = skip
-        self.content = content
-        self.style = style
-        self.fuAttn = fuAttn
-        self.fuIPAttn = fuIPAttn
-        self.adainIP = adainIP
-        self.fuScale = fuScale
-        self.denoise_step = 0
-        self.end_fusion = end_fusion
-        self.name = attn_name
-        self.save_attn_map = save_attn_map
-        if self.content or self.style:
-            self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
-            self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
-        self.to_k_ip_content =None
-        self.to_v_ip_content =None
-    # def set_content_ipa(self,content_scale=1.0):
-    #     self.to_k_ip_content = nn.Linear(self.cross_attention_dim or self.hidden_size, self.hidden_size, bias=False)
-    #     self.to_v_ip_content = nn.Linear(self.cross_attention_dim or self.hidden_size, self.hidden_size, bias=False)
-    #     self.content_scale=content_scale
-    #     self.content =True
-    def reset_denoise_step(self):
-        if self.denoise_step == 50:
-            self.denoise_step = 0
-            # if "up_blocks.0.attentions.1.transformer_blocks.0.attn2" in self.name:
-            #     print("attn2 reset successful")
-    def __call__(
-        self,
-        attn,
-        hidden_states,
-        encoder_hidden_states=None,
-        attention_mask=None,
-        temb=None,
-    ):
-        self.denoise_step += 1
-        residual = hidden_states
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-        input_ndim = hidden_states.ndim
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-        if attention_mask is not None:
-            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-            # scaled_dot_product_attention expects attention_mask shape to be
-            # (batch, heads, source_length, target_length)
-            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-        query = attn.to_q(hidden_states)
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        else:
-            # get encoder_hidden_states, ip_hidden_states
-            end_pos = encoder_hidden_states.shape[1] - self.num_content_tokens-self.num_style_tokens
-            encoder_hidden_states, ip_style_hidden_states = (
-                encoder_hidden_states[:, :end_pos, :],
-                encoder_hidden_states[:, end_pos:, :],
-            )
-            if attn.norm_cross:
-                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-        ## attention map
-        if self.save_attn_map:
-            attention_probs = attn.get_attention_scores(attn.head_to_batch_dim(query), attn.head_to_batch_dim(value), attention_mask)
-            if attention_probs is not None:
-                if not hasattr(attn, "attn_map"):
-                    setattr(attn, "attn_map", {})
-                    setattr(attn, "inference_step", 0)
-                else:
-                    attn.inference_step += 1
-                # # maybe we need to save all the timestep
-                # if attn.inference_step in self.attn_map_save_steps:
-                attn.attn_map[attn.inference_step] = attention_probs.clone().cpu().detach()
-                # attn.attn_map[attn.inference_step] = attention_probs.detach()
-                ## end of attention map
-            else:
-                print(f"{attn} didn't get the attention probs")
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        # TODO: add support for attn.scale when we move to Torch 2.1
-        # # modified the attnMap of the Stylization Image
-        if self.fuAttn and self.denoise_step <= self.end_fusion:
-            assert query.shape[0] == 4
-            scale_factor = 1 / math.sqrt(torch.tensor(head_dim, dtype=query.dtype))
-            text_attn_probs = (torch.matmul(query, key.transpose(-2, -1)) * scale_factor).softmax(dim=-1)
-            text_attn_probs[1] = self.fuScale*text_attn_probs[1] + (1-self.fuScale)*text_attn_probs[0]
-            text_attn_probs[3] = self.fuScale*text_attn_probs[3] + (1-self.fuScale)*text_attn_probs[2]
-            hidden_states = torch.matmul(text_attn_probs, value)
-        else:
-            hidden_states = F.scaled_dot_product_attention(
-                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-            )
-        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
-        hidden_states = hidden_states.to(query.dtype)
-        raw_hidden_states = hidden_states
-        if not self.skip and self.style is True:
-            # for ip-style-adapter
-            ip_style_key = self.to_k_ip(ip_style_hidden_states)
-            ip_style_value = self.to_v_ip(ip_style_hidden_states)
-            ip_style_key = ip_style_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-            ip_style_value = ip_style_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-            # the output of sdp = (batch, num_heads, seq_len, head_dim)
-            # TODO: add support for attn.scale when we move to Torch 2.1
-            if self.fuIPAttn and self.denoise_step <= self.end_fusion:
-                assert query.shape[0] == 4
-                if "down" in self.name:
-                    print("wrong! coding")
-                    exit()
-                scale_factor = 1 / math.sqrt(torch.tensor(head_dim, dtype=query.dtype))
-                ip_attn_probs = torch.matmul(query, ip_style_key.transpose(-2, -1)) * scale_factor
-                ip_attn_probs = F.softmax(ip_attn_probs, dim=-1)
-                ip_attn_probs[1] = self.fuScale*ip_attn_probs[1] + (1-self.fuScale)*ip_attn_probs[0]
-                ip_attn_probs[3] = self.fuScale*ip_attn_probs[3] + (1-self.fuScale)*ip_attn_probs[2]
-                ip_style_hidden_states = torch.matmul(ip_attn_probs, ip_style_value)
-            else:
-                ip_style_hidden_states = F.scaled_dot_product_attention(
-                    query, ip_style_key, ip_style_value, attn_mask=None, dropout_p=0.0, is_causal=False
-                )
-            ip_style_hidden_states = ip_style_hidden_states.transpose(1, 2).reshape(batch_size, -1,
-                                                                                    attn.heads * head_dim)
-            ip_style_hidden_states = ip_style_hidden_states.to(query.dtype)
-            # if self.adainIP and self.denoise_step >= self.start_adain:
-            if self.adainIP:
-                # print("adain")
-                # if self.denoise_step == 1 and "up_blocks.1.attentions.2.transformer_blocks.1" in self.name:
-                    # print("adain")
-                def adain(content, style):
-                    content_mean = content.mean(dim=1, keepdim=True)
-                    content_std = content.std(dim=1, keepdim=True)
-                    print("exp code")
-                    pdb.set_trace()
-                    style_mean = style.mean(dim=1, keepdim=True)
-                    style_std = style.std(dim=1, keepdim=True)
-                    normalized_content = (content - content_mean) / content_std
-                    stylized_content = normalized_content * style_std + style_mean
-                    return stylized_content
-                pdb.set_trace()
-                hidden_states = adain(content=hidden_states, style=ip_style_hidden_states)
-            else:
-                hidden_states = hidden_states + self.style_scale * ip_style_hidden_states
-        if hidden_states.shape[0] == 4:
-            hidden_states[0] = raw_hidden_states[0]
-            hidden_states[2] = raw_hidden_states[2]
-        # hidden_states = raw_hidden_states
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-        hidden_states = hidden_states / attn.rescale_output_factor
-        self.reset_denoise_step()
-        return hidden_states
 class AttnProcessor2_0_hijack(torch.nn.Module):
     r"""
     Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
@@ -1204,131 +769,8 @@ class AttnProcessor2_0_hijack(torch.nn.Module):
         save_in_unet='down',
         atten_control=None,
         fuSAttn=False,
-        fuScale=0,
-        end_fusion=0,
-        attn_name=None,
-    ):
-        super().__init__()
-        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
-        self.atten_control = atten_control
-        self.save_in_unet = save_in_unet
-        self.fuSAttn = fuSAttn
-        self.fuScale = fuScale
-        self.denoise_step = 0
-        self.end_fusion = end_fusion
-        self.name = attn_name
-    def reset_denoise_step(self):
-        if self.denoise_step == 50:
-            self.denoise_step = 0
-            # if "up_blocks.0.attentions.1.transformer_blocks.0.attn1" in self.name:
-            #     print("attn1 reset successful")
-    def __call__(
-        self,
-        attn,
-        hidden_states,
-        encoder_hidden_states=None,
-        attention_mask=None,
-        temb=None,
-    ):
-        self.denoise_step += 1
-        residual = hidden_states
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-        input_ndim = hidden_states.ndim
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-        if attention_mask is not None:
-            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-            # scaled_dot_product_attention expects attention_mask shape to be
-            # (batch, heads, source_length, target_length)
-            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-        query = attn.to_q(hidden_states)
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        # TODO: add support for attn.scale when we move to Torch 2.1
-        if self.fuSAttn and self.denoise_step <= self.end_fusion:
-            assert query.shape[0] == 4
-            if "up_blocks.1.attentions.2.transformer_blocks.1" in self.name and self.denoise_step == self.end_fusion:
-                print("now: ", self.denoise_step, "end now:", self.end_fusion, "scale: ", self.fuScale)
-                # pdb.set_trace()
-            scale_factor = 1 / math.sqrt(torch.tensor(head_dim, dtype=query.dtype))
-            attn_probs = (torch.matmul(query, key.transpose(-2, -1)) * scale_factor).softmax(dim=-1)
-            attn_probs[1] = self.fuScale*attn_probs[1] + (1-self.fuScale)*attn_probs[0]
-            attn_probs[3] = self.fuScale*attn_probs[3] + (1-self.fuScale)*attn_probs[2]
-            hidden_states = torch.matmul(attn_probs, value)
-        else:
-            hidden_states = F.scaled_dot_product_attention(
-                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-            )
-        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
-        hidden_states = hidden_states.to(query.dtype)
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-        hidden_states = hidden_states / attn.rescale_output_factor
-        if self.denoise_step == 50:
-            self.reset_denoise_step()
-        return hidden_states
-class AttnProcessor2_0_exp(torch.nn.Module):
-    r"""
-    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
-    """
-    def __init__(
-        self,
-        hidden_size=None,
-        cross_attention_dim=None,
-        save_in_unet='down',
-        atten_control=None,
-        fuSAttn=False,
-        fuScale=0,
         end_fusion=0,
-        attn_name=None,
     ):
         super().__init__()
         if not hasattr(F, "scaled_dot_product_attention"):
@@ -1337,16 +779,10 @@ class AttnProcessor2_0_exp(torch.nn.Module):
         self.save_in_unet = save_in_unet
         self.fuSAttn = fuSAttn
-        self.fuScale = fuScale
         self.denoise_step = 0
         self.end_fusion = end_fusion
-        self.name = attn_name
-    def reset_denoise_step(self):
-        if self.denoise_step == 50:
-            self.denoise_step = 0
-            # if "up_blocks.0.attentions.1.transformer_blocks.0.attn1" in self.name:
-            #     print("attn1 reset successful")
     def __call__(
         self,
@@ -1403,26 +839,10 @@ class AttnProcessor2_0_exp(torch.nn.Module):
         # TODO: add support for attn.scale when we move to Torch 2.1
         if self.fuSAttn and self.denoise_step <= self.end_fusion:
             assert query.shape[0] == 4
-            if "up_blocks.1.attentions.2.transformer_blocks.1" in self.name and self.denoise_step == self.end_fusion:
-                print("now: ", self.denoise_step, "end now:", self.end_fusion, "scale: ", self.fuScale)
-                # pdb.set_trace()
             scale_factor = 1 / math.sqrt(torch.tensor(head_dim, dtype=query.dtype))
             attn_probs = (torch.matmul(query, key.transpose(-2, -1)) * scale_factor).softmax(dim=-1)
-            attn_probs[1] = self.fuScale*attn_probs[1] + (1-self.fuScale)*attn_probs[0]
-            attn_probs[3] = self.fuScale*attn_probs[3] + (1-self.fuScale)*attn_probs[2]
-            print("exp code")
-            pdb.set_trace()
-            def adain(content, style):
-                content_mean = content.mean(dim=1, keepdim=True)
-                content_std = content.std(dim=1, keepdim=True)
-                style_mean = style.mean(dim=1, keepdim=True)
-                style_std = style.std(dim=1, keepdim=True)
-                normalized_content = (content - content_mean) / content_std
-                stylized_content = normalized_content * style_std + style_mean
-                return stylized_content
-            value[1] = adain(content=value[0], style=value[1])
-            value[3] = adain(content=value[2], style=value[3])
             hidden_states = torch.matmul(attn_probs, value)
         else:
             hidden_states = F.scaled_dot_product_attention(
@@ -1445,7 +865,8 @@ class AttnProcessor2_0_exp(torch.nn.Module):
         hidden_states = hidden_states / attn.rescale_output_factor
-        self.reset_denoise_step()
         return hidden_states
 class IPAttnProcessor2_0_cross_modal(torch.nn.Module):
@@ -1463,7 +884,7 @@ class IPAttnProcessor2_0_cross_modal(torch.nn.Module):
     """
     def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4, skip=False,
-                 fuAttn=False, fuIPAttn=False, adainIP=False, end_fusion=0, fuScale=0, attn_name=None):
         super().__init__()
         if not hasattr(F, "scaled_dot_product_attention"):
@@ -1478,19 +899,12 @@ class IPAttnProcessor2_0_cross_modal(torch.nn.Module):
         self.fuAttn = fuAttn
         self.fuIPAttn = fuIPAttn
         self.adainIP = adainIP
-        self.denoise_step = fuScale
         self.end_fusion = end_fusion
-        self.fuScale = fuScale
-        self.name = attn_name
         self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
         self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
-    def reset_denoise_step(self):
-        if self.denoise_step == 50:
-            self.denoise_step = 0
-            # if "up_blocks.0.attentions.1.transformer_blocks.0.attn2" in self.name:
-            #     print("attn2 reset successful")
     def __call__(
         self,
@@ -1552,20 +966,10 @@ class IPAttnProcessor2_0_cross_modal(torch.nn.Module):
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
-        if self.fuAttn and self.denoise_step <= self.end_fusion:
-            assert query.shape[0] == 4
-            if "up_blocks.1.attentions.2.transformer_blocks.1" in self.name and self.denoise_step == self.end_fusion:
-                print("fuAttn")
-                print("now: ", self.denoise_step, "end now:", self.end_fusion, "scale: ", self.fuScale)
-            scale_factor = 1 / math.sqrt(torch.tensor(head_dim, dtype=query.dtype))
-            text_attn_probs = (torch.matmul(query, key.transpose(-2, -1)) * scale_factor).softmax(dim=-1)
-            text_attn_probs[1] = self.fuScale*text_attn_probs[1] + (1-self.fuScale)*text_attn_probs[0]
-            text_attn_probs[3] = self.fuScale*text_attn_probs[3] + (1-self.fuScale)*text_attn_probs[2]
-            hidden_states = torch.matmul(text_attn_probs, value)
-        else:
-            hidden_states = F.scaled_dot_product_attention(
-                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-            )
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states = hidden_states.to(query.dtype)
@@ -1582,22 +986,9 @@ class IPAttnProcessor2_0_cross_modal(torch.nn.Module):
             # the output of sdp = (batch, num_heads, seq_len, head_dim)
             # TODO: add support for attn.scale when we move to Torch 2.1
-            if self.fuIPAttn and self.denoise_step <= self.end_fusion:
-                assert query.shape[0] == 4
-                print("fuIPAttn")
-                if "down" in self.name:
-                    print("wrong! coding")
-                    exit()
-                scale_factor = 1 / math.sqrt(torch.tensor(head_dim, dtype=query.dtype))
-                ip_attn_probs = torch.matmul(query, ip_key.transpose(-2, -1)) * scale_factor
-                ip_attn_probs = F.softmax(ip_attn_probs, dim=-1)
-                ip_attn_probs[1] = self.fuScale*ip_attn_probs[1] + (1-self.fuScale)*ip_attn_probs[0]
-                ip_attn_probs[3] = self.fuScale*ip_attn_probs[3] + (1-self.fuScale)*ip_attn_probs[2]
-                ip_hidden_states = torch.matmul(ip_attn_probs, ip_value)
-            else:
-                ip_hidden_states = F.scaled_dot_product_attention(
-                    query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
-                )
             with torch.no_grad():
                 self.attn_map = query @ ip_key.transpose(-2, -1).softmax(dim=-1)
@@ -1639,7 +1030,7 @@ class IPAttnProcessor2_0_cross_modal(torch.nn.Module):
         hidden_states = hidden_states / attn.rescale_output_factor
-        if self.denoise_step == 50:
-            self.reset_denoise_step()
         return hidden_states

         return hidden_states
 class AttnProcessor2_0_hijack(torch.nn.Module):
     r"""
     Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0).
         save_in_unet='down',
         atten_control=None,
         fuSAttn=False,
         end_fusion=0,
+        num_inference_step=50,
     ):
         super().__init__()
         if not hasattr(F, "scaled_dot_product_attention"):
         self.save_in_unet = save_in_unet
         self.fuSAttn = fuSAttn
         self.denoise_step = 0
         self.end_fusion = end_fusion
+        self.num_inference_step=num_inference_step
     def __call__(
         self,
         # TODO: add support for attn.scale when we move to Torch 2.1
         if self.fuSAttn and self.denoise_step <= self.end_fusion:
             assert query.shape[0] == 4
             scale_factor = 1 / math.sqrt(torch.tensor(head_dim, dtype=query.dtype))
             attn_probs = (torch.matmul(query, key.transpose(-2, -1)) * scale_factor).softmax(dim=-1)
+            attn_probs[1] = attn_probs[0]
+            attn_probs[3] = attn_probs[2]
             hidden_states = torch.matmul(attn_probs, value)
         else:
             hidden_states = F.scaled_dot_product_attention(
         hidden_states = hidden_states / attn.rescale_output_factor
+        if self.denoise_step == self.num_inference_step:
+            self.denoise_step == 0
         return hidden_states
 class IPAttnProcessor2_0_cross_modal(torch.nn.Module):
     """
     def __init__(self, hidden_size, cross_attention_dim=None, scale=1.0, num_tokens=4, skip=False,
+                 fuAttn=False, fuIPAttn=False, adainIP=False, end_fusion=0, num_inference_step=50):
         super().__init__()
         if not hasattr(F, "scaled_dot_product_attention"):
         self.fuAttn = fuAttn
         self.fuIPAttn = fuIPAttn
         self.adainIP = adainIP
+        self.denoise_step = 0
         self.end_fusion = end_fusion
+        self.num_inference_step = num_inference_step
         self.to_k_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
         self.to_v_ip = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False)
     def __call__(
         self,
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
         hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
         hidden_states = hidden_states.to(query.dtype)
             # the output of sdp = (batch, num_heads, seq_len, head_dim)
             # TODO: add support for attn.scale when we move to Torch 2.1
+            ip_hidden_states = F.scaled_dot_product_attention(
+                query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
+            )
             with torch.no_grad():
                 self.attn_map = query @ ip_key.transpose(-2, -1).softmax(dim=-1)
         hidden_states = hidden_states / attn.rescale_output_factor
+        if self.denoise_step == self.num_inference_step:
+            self.denoise_step == 0
         return hidden_states

ip_adapter/ip_adapter.py CHANGED Viewed

@@ -22,8 +22,6 @@ if is_torch2_available():
         IPAttnProcessor2_0 as IPAttnProcessor,
     )
     from .attention_processor import IP_CS_AttnProcessor2_0 as IP_CS_AttnProcessor
-    from .attention_processor import IP_FuAd_AttnProcessor2_0 as IP_FuAd_AttnProcessor
-    from .attention_processor import IP_FuAd_AttnProcessor2_0_exp as IP_FuAd_AttnProcessor_exp
     from .attention_processor import AttnProcessor2_0_exp as AttnProcessor_exp
     from .attention_processor import AttnProcessor2_0_hijack as AttnProcessor_hijack
     from .attention_processor import IPAttnProcessor2_0_cross_modal as IPAttnProcessor_cross_modal
@@ -949,7 +947,7 @@ class StyleStudio_Adapter(CSGO):
                     if block_name in name:
                         selected = True
                         # print(name)
-                        attn_procs[name] = IP_FuAd_AttnProcessor(
                             hidden_size=hidden_size,
                             cross_attention_dim=cross_attention_dim,
                             style_scale=1.0,
@@ -963,7 +961,7 @@ class StyleStudio_Adapter(CSGO):
                             attn_name=name,
                         )
                 if selected is False:
-                    attn_procs[name] = IP_FuAd_AttnProcessor(
                         hidden_size=hidden_size,
                         cross_attention_dim=cross_attention_dim,
                         num_style_tokens=self.num_style_tokens,
@@ -1011,7 +1009,7 @@ class StyleStudio_Adapter(CSGO):
     def set_scale(self, style_scale):
         for attn_processor in self.pipe.unet.attn_processors.values():
-            if isinstance(attn_processor, IP_FuAd_AttnProcessor):
                 if attn_processor.style is True:
                     attn_processor.style_scale = style_scale
                     # print('style_scale:',style_scale)
@@ -1100,9 +1098,14 @@ class StyleStudio_Adapter(CSGO):
             if isinstance(attn_processor, AttnProcessor_hijack):
                 attn_processor.fuSAttn = use_SAttn
     def set_adain(self, use_CMA):
         for attn_processor in self.pipe.unet.attn_processors.values():
-            if isinstance(attn_processor, IP_FuAd_AttnProcessor):
                 attn_processor.adainIP = use_CMA
     def generate(
@@ -1125,6 +1128,7 @@ class StyleStudio_Adapter(CSGO):
         self.set_endFusion(end_T = end_fusion)
         self.set_adain(use_CMA=cross_modal_adain)
         self.set_SAttn(use_SAttn=use_SAttn)
         # self.set_scale(style_scale=style_scale)
         num_prompts = 1 if isinstance(pil_style_image, Image.Image) else len(pil_style_image)
@@ -1188,93 +1192,6 @@ class StyleStudio_Adapter(CSGO):
         ).images
         return images
-# StyleStudio_Adapter experiment code
-class StyleStudio_Adapter_exp(StyleStudio_Adapter):
-    def set_ip_adapter(self):
-        unet = self.pipe.unet
-        attn_procs = {}
-        for name in unet.attn_processors.keys():
-            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
-            if name.startswith("mid_block"):
-                hidden_size = unet.config.block_out_channels[-1]
-            elif name.startswith("up_blocks"):
-                block_id = int(name[len("up_blocks.")])
-                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
-            elif name.startswith("down_blocks"):
-                block_id = int(name[len("down_blocks.")])
-                hidden_size = unet.config.block_out_channels[block_id]
-            if cross_attention_dim is None:
-                attn_procs[name] = AttnProcessor_exp(
-                                        fuSAttn=self.fuSAttn,
-                                        fuScale=self.fuScale,
-                                        end_fusion=self.end_fusion,
-                                        attn_name=name)
-            else:
-                # layername_id += 1
-                selected = False
-                for block_name in self.style_target_blocks:
-                    if block_name in name:
-                        selected = True
-                        # print(name)
-                        # Switch every processor inside the style blocks to FuAdAttn
-                        attn_procs[name] = IP_FuAd_AttnProcessor_exp(
-                            hidden_size=hidden_size,
-                            cross_attention_dim=cross_attention_dim,
-                            style_scale=1.0,
-                            style=True,
-                            num_content_tokens=self.num_content_tokens,
-                            num_style_tokens=self.num_style_tokens,
-                            fuAttn=self.fuAttn,
-                            fuIPAttn=self.fuIPAttn,
-                            adainIP=self.adainIP,
-                            fuScale=self.fuScale,
-                            end_fusion=self.end_fusion,
-                            attn_name=name,
-                            save_attn_map=self.save_attn_map,
-                        )
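-                # CSGO's content control is not needed here, so the cross attention that handled content tokens was removed.
-                # This also looks like a problematic spot in the CSGO code: whatever is set here gets overwritten later anyway.
-                # In CSGO's design the content blocks and style blocks do not overlap.
-                # selected is False means this is not a style block; the key point is skip=True.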
-                if selected is False:
-                    attn_procs[name] = IP_FuAd_AttnProcessor_exp(
-                        hidden_size=hidden_size,
-                        cross_attention_dim=cross_attention_dim,
-                        num_content_tokens=self.num_content_tokens,
-                        num_style_tokens=self.num_style_tokens,
-                        skip=True,
-                        fuAttn=self.fuAttn,
-                        fuIPAttn=self.fuIPAttn,
-                        adainIP=self.adainIP,
-                        fuScale=self.fuScale,
-                        end_fusion=self.end_fusion,
-                        attn_name=name,
-                        save_attn_map=self.save_attn_map,
-                    )
-                    # attn_procs[name] = IP_FuAd_AttnProcessor_exp(
-                    #     hidden_size=hidden_size,
-                    #     cross_attention_dim=cross_attention_dim,
-                    #     num_content_tokens=self.num_content_tokens,
-                    #     num_style_tokens=self.num_style_tokens,
-                    #     skip=True,
-                    #     fuAttn=self.fuAttn,
-                    #     fuIPAttn=self.fuIPAttn,
-                    # )
-                attn_procs[name].to(self.device, dtype=torch.float16)
-        unet.set_attn_processor(attn_procs)
-        if hasattr(self.pipe, "controlnet"):
-            if self.controlnet_adapter is False:
-                if isinstance(self.pipe.controlnet, MultiControlNetModel):
-                    for controlnet in self.pipe.controlnet.nets:
-                        controlnet.set_attn_processor(CNAttnProcessor(
-                            num_tokens=self.num_content_tokens + self.num_style_tokens))
-                else:
-                    self.pipe.controlnet.set_attn_processor(CNAttnProcessor(
-                        num_tokens=self.num_content_tokens + self.num_style_tokens))
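-            # Our code has no ControlNet that needs style injected into it; this is not an image-to-image task.
-            # We therefore removed the part of the original CSGO code that injected style into ControlNet.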
 class IPAdapterXL(IPAdapter):
     """SDXL"""
@@ -1361,397 +1278,4 @@ class IPAdapterXL(IPAdapter):
             **kwargs,
         ).images
-        return images
-class IPAdapterXL_cross_modal(IPAdapterXL):
-    def __init__(self, sd_pipe, image_encoder_path, ip_ckpt, device, num_tokens=4,
-                 target_blocks=["block"],
-                 fuAttn=False,
-                 fuSAttn=False,
-                 fuIPAttn=False,
-                 fuScale=0,
-                 adainIP=False,
-                 end_fusion=0,
-                 save_attn_map=False,):
-        self.fuAttn = fuAttn
-        self.fuSAttn = fuSAttn
-        self.fuIPAttn = fuIPAttn
-        self.adainIP = adainIP
-        self.fuScale = fuScale
-        if self.fuSAttn:
-            print(f"hijack Self AttnMap in {end_fusion} steps", "fuScale is: ", fuScale)
-        if self.fuAttn:
-            print(f"hijack Cross AttnMap in {end_fusion} steps", "fuScale is: ", fuScale)
-        if self.fuIPAttn:
-            print(f"hijack IP AttnMap in {end_fusion} steps", "fuScale is: ", fuScale)
-        self.end_fusion = end_fusion
-        self.save_attn_map = save_attn_map
-        self.device = device
-        self.image_encoder_path = image_encoder_path
-        self.ip_ckpt = ip_ckpt
-        self.num_tokens = num_tokens
-        self.target_blocks = target_blocks
-        self.pipe = sd_pipe.to(self.device)
-        self.set_ip_adapter()
-        # load image encoder
-        self.image_encoder = CLIPVisionModelWithProjection.from_pretrained(self.image_encoder_path).to(
-            self.device, dtype=torch.float16
-        )
-        self.clip_image_processor = CLIPImageProcessor()
-        # image proj model
-        self.image_proj_model = self.init_proj()
-        self.load_ip_adapter()
-    def init_proj(self):
-        image_proj_model = ImageProjModel(
-            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
-            clip_embeddings_dim=self.image_encoder.config.projection_dim,
-            clip_extra_context_tokens=self.num_tokens,
-        ).to(self.device, dtype=torch.float16)
-        return image_proj_model
-    def set_ip_adapter(self):
-        unet = self.pipe.unet
-        attn_procs = {}
-        for name in unet.attn_processors.keys():
-            cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
-            if name.startswith("mid_block"):
-                hidden_size = unet.config.block_out_channels[-1]
-            elif name.startswith("up_blocks"):
-                block_id = int(name[len("up_blocks.")])
-                hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
-            elif name.startswith("down_blocks"):
-                block_id = int(name[len("down_blocks.")])
-                hidden_size = unet.config.block_out_channels[block_id]
-            if cross_attention_dim is None:
-                attn_procs[name] = AttnProcessor_hijack(
-                                        fuSAttn=self.fuSAttn,
-                                        fuScale=self.fuScale,
-                                        end_fusion=self.end_fusion,
-                                        attn_name=name) # Self Attention
-            else: # Cross Attention
-                selected = False
-                for block_name in self.target_blocks:
-                    if block_name in name:
-                        selected = True
-                        break
-                if selected:
-                    attn_procs[name] = IPAttnProcessor_cross_modal(
-                        hidden_size=hidden_size,
-                        cross_attention_dim=cross_attention_dim,
-                        scale=1.0,
-                        num_tokens=self.num_tokens,
-                        fuAttn=self.fuAttn,
-                        fuIPAttn=self.fuIPAttn,
-                        adainIP=self.adainIP,
-                        fuScale=self.fuScale,
-                        end_fusion=self.end_fusion,
-                        attn_name=name,
-                    ).to(self.device, dtype=torch.float16)
-                else:
-                    attn_procs[name] = IPAttnProcessor_cross_modal(
-                        hidden_size=hidden_size,
-                        cross_attention_dim=cross_attention_dim,
-                        scale=1.0,
-                        num_tokens=self.num_tokens,
-                        skip=True,
-                        fuAttn=self.fuAttn,
-                        fuIPAttn=self.fuIPAttn,
-                        adainIP=self.adainIP,
-                        fuScale=self.fuScale,
-                        end_fusion=self.end_fusion,
-                        attn_name=name,
-                    ).to(self.device, dtype=torch.float16)
-        unet.set_attn_processor(attn_procs)
-        if hasattr(self.pipe, "controlnet"):
-            if isinstance(self.pipe.controlnet, MultiControlNetModel):
-                for controlnet in self.pipe.controlnet.nets:
-                    controlnet.set_attn_processor(CNAttnProcessor(num_tokens=self.num_tokens))
-            else:
-                self.pipe.controlnet.set_attn_processor(CNAttnProcessor(num_tokens=self.num_tokens))
-    def load_ip_adapter(self):
-        if os.path.splitext(self.ip_ckpt)[-1] == ".safetensors":
-            state_dict = {"image_proj": {}, "ip_adapter": {}}
-            with safe_open(self.ip_ckpt, framework="pt", device="cpu") as f:
-                for key in f.keys():
-                    if key.startswith("image_proj."):
-                        state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
-                    elif key.startswith("ip_adapter."):
-                        state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
-        else:
-            state_dict = torch.load(self.ip_ckpt, map_location="cpu")
-        self.image_proj_model.load_state_dict(state_dict["image_proj"])
-        ip_layers = torch.nn.ModuleList(self.pipe.unet.attn_processors.values())
-        ip_layers.load_state_dict(state_dict["ip_adapter"], strict=False)
-    @torch.inference_mode()
-    def get_image_embeds(self, pil_image=None, clip_image_embeds=None, content_prompt_embeds=None):
-        if pil_image is not None:
-            if isinstance(pil_image, Image.Image):
-                pil_image = [pil_image]
-            clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
-            clip_image_embeds = self.image_encoder(clip_image.to(self.device, dtype=torch.float16)).image_embeds
-        else:
-            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float16)
-        if content_prompt_embeds is not None:
-            clip_image_embeds = clip_image_embeds - content_prompt_embeds
-        image_prompt_embeds = self.image_proj_model(clip_image_embeds)
-        uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(clip_image_embeds))
-        return image_prompt_embeds, uncond_image_prompt_embeds
-    def set_scale(self, scale):
-        for attn_processor in self.pipe.unet.attn_processors.values():
-            if isinstance(attn_processor, IPAttnProcessor_cross_modal):
-                attn_processor.scale = scale
-    @torch.inference_mode()
-    def get_neg_image_embeds(self, pil_image=None, clip_image_embeds=None, content_prompt_embeds=None):
-        if pil_image is not None:
-            if isinstance(pil_image, Image.Image):
-                pil_image = [pil_image]
-            clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
-            clip_image_embeds = self.image_encoder(clip_image.to(self.device, dtype=torch.float16)).image_embeds
-        else:
-            clip_image_embeds = clip_image_embeds.to(self.device, dtype=torch.float16)
-        if content_prompt_embeds is not None:
-            clip_image_embeds = clip_image_embeds - content_prompt_embeds
-        neg_image_prompt_embeds = self.image_proj_model(clip_image_embeds)
-        return neg_image_prompt_embeds
-    def generate(
-        self,
-        pil_image,
-        neg_pil_image=None,
-        prompt=None,
-        negative_prompt=None,
-        scale=1.0,
-        num_samples=4,
-        seed=None,
-        num_inference_steps=30,
-        neg_content_emb=None,
-        neg_content_prompt=None,
-        neg_content_scale=1.0,
-        **kwargs,
-    ):
-        self.set_scale(scale)
-        num_prompts = 1 if isinstance(pil_image, Image.Image) else len(pil_image)
-        if prompt is None:
-            prompt = "best quality, high quality"
-        if negative_prompt is None:
-            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
-        if not isinstance(prompt, List):
-            prompt = [prompt] * num_prompts
-        if not isinstance(negative_prompt, List):
-            negative_prompt = [negative_prompt] * num_prompts
-        if neg_content_emb is None:
-            if neg_content_prompt is not None:
-                with torch.inference_mode():
-                    (
-                        prompt_embeds_, # torch.Size([1, 77, 2048])
-                        negative_prompt_embeds_,
-                        pooled_prompt_embeds_, # torch.Size([1, 1280])
-                        negative_pooled_prompt_embeds_,
-                    ) = self.pipe.encode_prompt(
-                        neg_content_prompt,
-                        num_images_per_prompt=num_samples,
-                        do_classifier_free_guidance=True,
-                        negative_prompt=negative_prompt,
-                    )
-                    pooled_prompt_embeds_ *= neg_content_scale
-            else:
-                pooled_prompt_embeds_ = neg_content_emb
-        else:
-            pooled_prompt_embeds_ = None
-        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(pil_image, content_prompt_embeds=pooled_prompt_embeds_)
-        if neg_pil_image is not None:
-            neg_image_prompt_embeds = self.get_neg_image_embeds(neg_pil_image)
-            cos_sim_neg = F.cosine_similarity(image_prompt_embeds, neg_image_prompt_embeds.squeeze(0).unsqueeze(1), dim=-1)
-            cos_sim_uncond = F.cosine_similarity(image_prompt_embeds, uncond_image_prompt_embeds.squeeze(0).unsqueeze(1), dim=-1)
-            print(f"neg cos sim is: {cos_sim_neg.diagonal()}")
-            print(f"uncond cos sim is: {cos_sim_uncond.diagonal()}")
-            uncond_image_prompt_embeds = neg_image_prompt_embeds
-        bs_embed, seq_len, _ = image_prompt_embeds.shape
-        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
-        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
-        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
-        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
-        with torch.inference_mode():
-            (
-                prompt_embeds,
-                negative_prompt_embeds,
-                pooled_prompt_embeds,
-                negative_pooled_prompt_embeds,
-            ) = self.pipe.encode_prompt(
-                prompt,
-                num_images_per_prompt=num_samples,
-                do_classifier_free_guidance=True,
-                negative_prompt=negative_prompt,
-            )
-            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
-            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
-        # self.generator = get_generator(seed, self.device)
-        images = self.pipe(
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            pooled_prompt_embeds=pooled_prompt_embeds,
-            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
-            num_inference_steps=num_inference_steps,
-            # generator=self.generator,
-            **kwargs,
-        ).images
-        return images
-class IPAdapterPlus(IPAdapter):
-    """IP-Adapter with fine-grained features"""
-    def init_proj(self):
-        image_proj_model = Resampler(
-            dim=self.pipe.unet.config.cross_attention_dim,
-            depth=4,
-            dim_head=64,
-            heads=12,
-            num_queries=self.num_tokens,
-            embedding_dim=self.image_encoder.config.hidden_size,
-            output_dim=self.pipe.unet.config.cross_attention_dim,
-            ff_mult=4,
-        ).to(self.device, dtype=torch.float16)
-        return image_proj_model
-    @torch.inference_mode()
-    def get_image_embeds(self, pil_image=None, clip_image_embeds=None):
-        if isinstance(pil_image, Image.Image):
-            pil_image = [pil_image]
-        clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.float16)
-        clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
-        image_prompt_embeds = self.image_proj_model(clip_image_embeds)
-        uncond_clip_image_embeds = self.image_encoder(
-            torch.zeros_like(clip_image), output_hidden_states=True
-        ).hidden_states[-2]
-        uncond_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds)
-        return image_prompt_embeds, uncond_image_prompt_embeds
-class IPAdapterFull(IPAdapterPlus):
-    """IP-Adapter with full features"""
-    def init_proj(self):
-        image_proj_model = MLPProjModel(
-            cross_attention_dim=self.pipe.unet.config.cross_attention_dim,
-            clip_embeddings_dim=self.image_encoder.config.hidden_size,
-        ).to(self.device, dtype=torch.float16)
-        return image_proj_model
-class IPAdapterPlusXL(IPAdapter):
-    """SDXL"""
-    def init_proj(self):
-        image_proj_model = Resampler(
-            dim=1280,
-            depth=4,
-            dim_head=64,
-            heads=20,
-            num_queries=self.num_tokens,
-            embedding_dim=self.image_encoder.config.hidden_size,
-            output_dim=self.pipe.unet.config.cross_attention_dim,
-            ff_mult=4,
-        ).to(self.device, dtype=torch.float16)
-        return image_proj_model
-    @torch.inference_mode()
-    def get_image_embeds(self, pil_image):
-        if isinstance(pil_image, Image.Image):
-            pil_image = [pil_image]
-        clip_image = self.clip_image_processor(images=pil_image, return_tensors="pt").pixel_values
-        clip_image = clip_image.to(self.device, dtype=torch.float16)
-        clip_image_embeds = self.image_encoder(clip_image, output_hidden_states=True).hidden_states[-2]
-        image_prompt_embeds = self.image_proj_model(clip_image_embeds)
-        uncond_clip_image_embeds = self.image_encoder(
-            torch.zeros_like(clip_image), output_hidden_states=True
-        ).hidden_states[-2]
-        uncond_image_prompt_embeds = self.image_proj_model(uncond_clip_image_embeds)
-        return image_prompt_embeds, uncond_image_prompt_embeds
-    def generate(
-            self,
-            pil_image,
-            prompt=None,
-            negative_prompt=None,
-            scale=1.0,
-            num_samples=4,
-            seed=None,
-            num_inference_steps=30,
-            **kwargs,
-    ):
-        self.set_scale(scale)
-        num_prompts = 1 if isinstance(pil_image, Image.Image) else len(pil_image)
-        if prompt is None:
-            prompt = "best quality, high quality"
-        if negative_prompt is None:
-            negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
-        if not isinstance(prompt, List):
-            prompt = [prompt] * num_prompts
-        if not isinstance(negative_prompt, List):
-            negative_prompt = [negative_prompt] * num_prompts
-        image_prompt_embeds, uncond_image_prompt_embeds = self.get_image_embeds(pil_image)
-        bs_embed, seq_len, _ = image_prompt_embeds.shape
-        image_prompt_embeds = image_prompt_embeds.repeat(1, num_samples, 1)
-        image_prompt_embeds = image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
-        uncond_image_prompt_embeds = uncond_image_prompt_embeds.repeat(1, num_samples, 1)
-        uncond_image_prompt_embeds = uncond_image_prompt_embeds.view(bs_embed * num_samples, seq_len, -1)
-        with torch.inference_mode():
-            (
-                prompt_embeds,
-                negative_prompt_embeds,
-                pooled_prompt_embeds,
-                negative_pooled_prompt_embeds,
-            ) = self.pipe.encode_prompt(
-                prompt,
-                num_images_per_prompt=num_samples,
-                do_classifier_free_guidance=True,
-                negative_prompt=negative_prompt,
-            )
-            prompt_embeds = torch.cat([prompt_embeds, image_prompt_embeds], dim=1)
-            negative_prompt_embeds = torch.cat([negative_prompt_embeds, uncond_image_prompt_embeds], dim=1)
-        generator = get_generator(seed, self.device)
-        images = self.pipe(
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            pooled_prompt_embeds=pooled_prompt_embeds,
-            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
-            num_inference_steps=num_inference_steps,
-            generator=generator,
-            **kwargs,
-        ).images
-        return images

         IPAttnProcessor2_0 as IPAttnProcessor,
     )
     from .attention_processor import IP_CS_AttnProcessor2_0 as IP_CS_AttnProcessor
     from .attention_processor import AttnProcessor2_0_exp as AttnProcessor_exp
     from .attention_processor import AttnProcessor2_0_hijack as AttnProcessor_hijack
     from .attention_processor import IPAttnProcessor2_0_cross_modal as IPAttnProcessor_cross_modal
                     if block_name in name:
                         selected = True
                         # print(name)
+                        attn_procs[name] = IPAttnProcessor_cross_modal(
                             hidden_size=hidden_size,
                             cross_attention_dim=cross_attention_dim,
                             style_scale=1.0,
                             attn_name=name,
                         )
                 if selected is False:
+                    attn_procs[name] = IPAttnProcessor_cross_modal(
                         hidden_size=hidden_size,
                         cross_attention_dim=cross_attention_dim,
                         num_style_tokens=self.num_style_tokens,
     def set_scale(self, style_scale):
         # Assign `style_scale` to the UNet's cross-modal IP attention processors.
         # Only processors whose `style` attribute is exactly True are updated
         # (identity check, so merely truthy values do not qualify).
         # NOTE(review): this view ends at a diff-hunk boundary; confirm the
         # method has no further statements in the full file.
         for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, IPAttnProcessor_cross_modal):
                 if attn_processor.style is True:
                     attn_processor.style_scale = style_scale
                     # print('style_scale:',style_scale)
             if isinstance(attn_processor, AttnProcessor_hijack):
                 attn_processor.fuSAttn = use_SAttn
+    def set_num_inference_step(self, num_T):
         # Store `num_T` as `num_inference_step` on every attention processor that
         # is an AttnProcessor_hijack or an IPAttnProcessor_cross_modal; other
         # processor types are left untouched.
         # A caller in this file passes its `num_inference_steps` argument here.
         # NOTE(review): how the processors consume this value is defined in
         # attention_processor.py and is not visible here -- confirm there.
+        for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, AttnProcessor_hijack) or isinstance(attn_processor, IPAttnProcessor_cross_modal):
+                attn_processor.num_inference_step = num_T
     def set_adain(self, use_CMA):
         # Write `use_CMA` to the `adainIP` attribute of every
         # IPAttnProcessor_cross_modal registered on the UNet.
         # A caller in this file passes its `cross_modal_adain` argument here.
         # NOTE(review): presumably toggles cross-modal AdaIN inside the
         # processor; its effect is implemented elsewhere -- confirm there.
         for attn_processor in self.pipe.unet.attn_processors.values():
+            if isinstance(attn_processor, IPAttnProcessor_cross_modal):
                 attn_processor.adainIP = use_CMA
     def generate(
         self.set_endFusion(end_T = end_fusion)
         self.set_adain(use_CMA=cross_modal_adain)
         self.set_SAttn(use_SAttn=use_SAttn)
+        self.set_num_inference_step(num_T=num_inference_steps)
         # self.set_scale(style_scale=style_scale)
         num_prompts = 1 if isinstance(pil_style_image, Image.Image) else len(pil_style_image)
         ).images
         return images
 class IPAdapterXL(IPAdapter):
     """SDXL"""
             **kwargs,
         ).images
+        return images