Spaces:

Westlake-AGI-Lab
/

StyleStudio

Running on Zero

App Files Files Community

Leimingkun committed 20 days ago

Commit

f343ea1

•

1 Parent(s): 123f4ac

stylestudio

Browse files

Files changed (11) hide show

.gitattributes +1 -0
app.py +44 -29
app_exp.py +244 -0
assets/style3.jpg +0 -0
ip_adapter/__pycache__/__init__.cpython-39.pyc +0 -0
ip_adapter/__pycache__/attention_processor.cpython-39.pyc +0 -0
ip_adapter/__pycache__/ip_adapter.cpython-39.pyc +0 -0
ip_adapter/__pycache__/resampler.cpython-39.pyc +0 -0
ip_adapter/__pycache__/utils.cpython-39.pyc +0 -0
ip_adapter/attention_processor.py +2 -0
ip_adapter/ip_adapter.py +2 -1

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+app_exp.py

app.py CHANGED Viewed

@@ -59,23 +59,37 @@ def get_example():
     case = [
         [
             './assets/style1.jpg',
-            "Text-Driven Style Synthesis",
             "A red apple",
             7.0,
             42,
-            20,
          ],
     ]
     return case
-def run_for_examples(style_image_pil, target, prompt, guidance_scale, seed, end_fusion):
     return create_image(
         style_image_pil=style_image_pil,
         prompt=prompt,
-        guidance_scale=7.0,
         num_inference_steps=50,
-        seed=42,
         end_fusion=end_fusion,
         use_SAttn=True,
         crossModalAdaIN=True,
@@ -86,21 +100,20 @@ def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
         seed = random.randint(0, MAX_SEED)
     return seed
-@spaces.GPU
-def create_image(
-                 style_image_pil,
                  prompt,
-                 guidance_scale,
-                 num_inference_steps,
-                 end_fusion,
-                 crossModalAdaIN,
-                 use_SAttn,
-                 seed,
                  neg_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
 ):
     style_image = style_image_pil
     generator = torch.Generator(device).manual_seed(seed)
     init_latents = torch.randn((1, 4, 128, 128), generator=generator, device="cuda", dtype=torch.float16)
     num_sample=1
@@ -122,6 +135,7 @@ def create_image(
                                 use_SAttn=use_SAttn,
                                 generator=generator,
                                 )
     if use_SAttn:
@@ -135,23 +149,28 @@ title = r"""
 """
 description = r"""
-<b>Official 🤗 Gradio demo</b> for <a href='https://github.com/MingKunLei/StyleStudio' target='_blank'><b>StyleStudio: Text-Driven Style Transfer with Selective Control of Style Elements</b></a>.<br>
 How to use:<br>
 1. Upload a style image.
-2. <b>Enter your desired prompt<b>.
 3. Click the <b>Submit</b> button to begin customization.
 4. Share your stylized photo with your friends and enjoy! 😊
 Advanced usage:<br>
 1. Click advanced options.
 2. Choose different guidance and steps.
-3. Set the timing for the Teacher Model's participation
 """
 article = r"""
 ---
 📝 **Tips**
-As the value of end_fusion increases, the style gradually diminishes.
 ---
 📝 **Citation**
 <br>
@@ -176,10 +195,6 @@ with block:
                     with gr.Column():
                         style_image_pil = gr.Image(label="Style Image", type='pil')
-                target = gr.Radio(["Text-Driven Style Synthesis"],
-                                  value="Text-Driven Style Synthesis",
-                                  label="task")
                 prompt = gr.Textbox(label="Prompt",
                                     value="A red apple")
@@ -190,14 +205,14 @@ with block:
                     guidance_scale = gr.Slider(minimum=1, maximum=15.0, step=0.01, value=7.0, label="guidance scale")
-                    num_inference_steps = gr.Slider(minimum=5, maximum=100.0, step=1.0, value=50,
                                                     label="num inference steps")
-                    end_fusion = gr.Slider(minimum=0, maximum=num_inference_steps, step=1.0, value=20.0, label="end fusion")
-                    seed = gr.Slider(minimum=-1000000, maximum=1000000, value=1, step=1, label="Seed Value")
-                    randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
                     crossModalAdaIN = gr.Checkbox(label="Cross Modal AdaIN", value=True)
                     use_SAttn = gr.Checkbox(label="Teacher Model", value=True)
@@ -218,18 +233,18 @@ with block:
             inputs=[
                     style_image_pil,
                     prompt,
                     guidance_scale,
                     num_inference_steps,
                     end_fusion,
                     crossModalAdaIN,
                     use_SAttn,
-                    seed,
-                    neg_prompt,],
             outputs=[generated_image])
     gr.Examples(
         examples=get_example(),
-        inputs=[style_image_pil, target, prompt, guidance_scale, seed, end_fusion],
         fn=run_for_examples,
         outputs=[generated_image],
         cache_examples=False,

     case = [
         [
             './assets/style1.jpg',
             "A red apple",
             7.0,
             42,
+            10,
+         ],
+        [
+            './assets/style2.jpg',
+            "A black car",
+            7.0,
+            42,
+            10,
+         ],
+        [
+            './assets/style3.jpg',
+            "A orange bus",
+            7.0,
+            42,
+            10,
          ],
     ]
     return case
+def run_for_examples(style_image_pil, prompt, guidance_scale, seed, end_fusion):
     return create_image(
         style_image_pil=style_image_pil,
         prompt=prompt,
+        neg_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
+        guidance_scale=guidance_scale,
         num_inference_steps=50,
+        seed=seed,
         end_fusion=end_fusion,
         use_SAttn=True,
         crossModalAdaIN=True,
         seed = random.randint(0, MAX_SEED)
     return seed
+def create_image(style_image_pil,
                  prompt,
                  neg_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
+                 guidance_scale=7,
+                 num_inference_steps=50,
+                 end_fusion=20,
+                 crossModalAdaIN=True,
+                 use_SAttn=True,
+                 seed=42,
 ):
     style_image = style_image_pil
+    print(seed)
     generator = torch.Generator(device).manual_seed(seed)
     init_latents = torch.randn((1, 4, 128, 128), generator=generator, device="cuda", dtype=torch.float16)
     num_sample=1
                                 use_SAttn=use_SAttn,
                                 generator=generator,
+                                latents=init_latents,
                                 )
     if use_SAttn:
 """
 description = r"""
+<b>Official 🤗 Gradio demo</b> for <a href='https://github.com/Westlake-AGI-Lab/StyleStudio' target='_blank'><b>StyleStudio: Text-Driven Style Transfer with Selective Control of Style Elements</b></a>.<br>
 How to use:<br>
 1. Upload a style image.
+2. <b>Enter your desired prompt</b>.
 3. Click the <b>Submit</b> button to begin customization.
 4. Share your stylized photo with your friends and enjoy! 😊
 Advanced usage:<br>
 1. Click advanced options.
 2. Choose different guidance and steps.
+3. Set the timing for the Teacher Model's participation.
+4. Feel free to discontinue using the Cross-Modal AdaIN and the Teacher Model for result comparison.
 """
 article = r"""
 ---
 📝 **Tips**
+<br>
+1. As the value of end_fusion <b>increases</b>, the style gradually diminishes.
+Therefore, it is suggested to set end_fusion to be between <b>1/5 and 1/3</b> of the number of inference steps (num inference steps).
+2. If you want to experience style-based CFG, see the details on the <a href="https://github.com/Westlake-AGI-Lab/StyleStudio">GitHub repo</a>.
 ---
 📝 **Citation**
 <br>
                     with gr.Column():
                         style_image_pil = gr.Image(label="Style Image", type='pil')
                 prompt = gr.Textbox(label="Prompt",
                                     value="A red apple")
                     guidance_scale = gr.Slider(minimum=1, maximum=15.0, step=0.01, value=7.0, label="guidance scale")
+                    num_inference_steps = gr.Slider(minimum=5, maximum=200.0, step=1.0, value=50,
                                                     label="num inference steps")
+                    end_fusion = gr.Slider(minimum=0, maximum=200, step=1.0, value=20.0, label="end fusion")
+                    seed = gr.Slider(minimum=-1000000, maximum=1000000, value=42, step=1, label="Seed Value")
+                    randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
                     crossModalAdaIN = gr.Checkbox(label="Cross Modal AdaIN", value=True)
                     use_SAttn = gr.Checkbox(label="Teacher Model", value=True)
             inputs=[
                     style_image_pil,
                     prompt,
+                    neg_prompt,
                     guidance_scale,
                     num_inference_steps,
                     end_fusion,
                     crossModalAdaIN,
                     use_SAttn,
+                    seed,],
             outputs=[generated_image])
     gr.Examples(
         examples=get_example(),
+        inputs=[style_image_pil, prompt, guidance_scale, seed, end_fusion],
         fn=run_for_examples,
         outputs=[generated_image],
         cache_examples=False,

app_exp.py ADDED Viewed

	@@ -0,0 +1,244 @@

+import sys
+sys.path.append("./")
+import gradio as gr
+import spaces
+import torch
+from ip_adapter.utils import BLOCKS as BLOCKS
+import numpy as np
+import random
+from diffusers import (
+    AutoencoderKL,
+    StableDiffusionXLPipeline,
+)
+from ip_adapter import StyleStudio_Adapter
+# ---------------------------------------------------------------------------
+# Module-level setup: everything below runs once at import time.
+# ---------------------------------------------------------------------------
+# Prefer CUDA when torch reports it as available, otherwise fall back to CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# NOTE(review): `dtype` is computed here but is not referenced in the visible
+# part of this file; the loaders below pass torch.float16 explicitly.
+dtype = torch.float16 if str(device).__contains__("cuda") else torch.float32
+# Absolute local paths to the model weights (all under /mnt/agilab/models).
+base_model_path =  "/mnt/agilab/models/sdxl"
+image_encoder_path = "/mnt/agilab/models/ipadapter_sdxl/image_encoder"
+csgo_ckpt = "/mnt/agilab/models/CSGO/csgo_4_32.bin"
+pretrained_vae_name_or_path = '/mnt/agilab/models/madebyollin_sdxl-vae-fp16-fix'
+# NOTE(review): `weight_dtype` is likewise not referenced in the visible code.
+weight_dtype = torch.float16
+# Load the VAE separately so it can be handed to the SDXL pipeline below.
+vae = AutoencoderKL.from_pretrained(pretrained_vae_name_or_path,torch_dtype=torch.float16)
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    base_model_path,
+    torch_dtype=torch.float16,
+    add_watermarker=False,
+    vae=vae
+)
+pipe.enable_vae_tiling()
+# Attention blocks selected for style injection, taken from ip_adapter.utils.
+target_style_blocks = BLOCKS['style']
+# Adapter object used by create_image(); it wraps `pipe` and is constructed
+# with an initial end_fusion of 20 (create_image passes its own value per call).
+csgo = StyleStudio_Adapter(
+        pipe, image_encoder_path, csgo_ckpt, device, num_style_tokens=32,
+        target_style_blocks=target_style_blocks,
+        controlnet_adapter=False,
+        style_model_resampler=True,
+        fuSAttn=True,
+        end_fusion=20,
+        adainIP=True,
+        )
+# Upper bound (largest int32) for seeds drawn by randomize_seed_fn().
+MAX_SEED = np.iinfo(np.int32).max
+def get_example():
+    """Return the example rows shown in the demo's ``gr.Examples`` widget.
+
+    Each row is ``[style image path, prompt, guidance scale, seed,
+    end_fusion]``, matching the order of the ``inputs`` list passed to
+    ``gr.Examples`` further down in this file.
+    """
+    case = [
+        [
+            './assets/style1.jpg',
+            "A red apple",
+            7.0,
+            42,
+            10,
+         ],
+        [
+            './assets/style2.jpg',
+            "A black car",
+            7.0,
+            42,
+            10,
+         ],
+        [
+            './assets/style3.jpg',
+            "A orange bus",
+            7.0,
+            42,
+            10,
+         ],
+    ]
+    return case
+def run_for_examples(style_image_pil, prompt, guidance_scale, seed, end_fusion):
+    """Run ``create_image`` for one example row.
+
+    Only the five example columns are forwarded. The negative prompt, the
+    step count (50) and both feature switches (``use_SAttn`` and
+    ``crossModalAdaIN``, both ``True``) are fixed here.
+
+    Returns whatever ``create_image`` returns.
+    """
+    return create_image(
+        style_image_pil=style_image_pil,
+        prompt=prompt,
+        neg_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
+        guidance_scale=guidance_scale,
+        num_inference_steps=50,
+        seed=seed,
+        end_fusion=end_fusion,
+        use_SAttn=True,
+        crossModalAdaIN=True,
+    )
+def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
+    """Return the seed to use for the next generation.
+
+    When ``randomize_seed`` is truthy, a fresh value is drawn uniformly from
+    ``[0, MAX_SEED]`` (both ends inclusive, per ``random.randint``);
+    otherwise ``seed`` is returned unchanged.
+    """
+    if randomize_seed:
+        seed = random.randint(0, MAX_SEED)
+    return seed
+def create_image(style_image_pil,
+                 prompt,
+                 neg_prompt="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry",
+                 guidance_scale=7,
+                 num_inference_steps=50,
+                 end_fusion=20,
+                 crossModalAdaIN=True,
+                 use_SAttn=True,
+                 seed=42,
+):
+    style_image = style_image_pil
+    generator = torch.Generator(device).manual_seed(seed)
+    init_latents = torch.randn((1, 4, 128, 128), generator=generator, device="cuda", dtype=torch.float16)
+    num_sample=1
+    if use_SAttn:
+        num_sample=2
+        init_latents = init_latents.repeat(num_sample, 1, 1, 1)
+    with torch.no_grad():
+        images = csgo.generate(pil_style_image=style_image,
+                                prompt=prompt,
+                                negative_prompt=neg_prompt,
+                                height=1024,
+                                width=1024,
+                                guidance_scale=guidance_scale,
+                                num_images_per_prompt=1,
+                                num_samples=num_sample,
+                                num_inference_steps=num_inference_steps,
+                                end_fusion=end_fusion,
+                                cross_modal_adain=crossModalAdaIN,
+                                use_SAttn=use_SAttn,
+                                generator=generator,
+                                latents=init_latents,
+                                )
+    if use_SAttn:
+        return [images[1]]
+    else:
+        return [images[0]]
+# Description
+# Static HTML/Markdown snippets rendered by the gr.Markdown calls in the UI
+# section: `title` and `description` above the controls, `article` below them.
+title = r"""
+<h1 align="center">StyleStudio: Text-Driven Style Transfer with Selective Control of Style Elements</h1>
+"""
+# NOTE(review): item "4." under "Advanced usage" below has no text -- either
+# fill it in or drop the number.
+description = r"""
+<b>Official 🤗 Gradio demo</b> for <a href='https://github.com/Westlake-AGI-Lab/StyleStudio' target='_blank'><b>StyleStudio: Text-Driven Style Transfer with Selective Control of Style Elements</b></a>.<br>
+How to use:<br>
+1. Upload a style image.
+2. <b>Enter your desired prompt</b>.
+3. Click the <b>Submit</b> button to begin customization.
+4. Share your stylized photo with your friends and enjoy! 😊
+Advanced usage:<br>
+1. Click advanced options.
+2. Choose different guidance and steps.
+3. Set the timing for the Teacher Model's participation.
+4.
+"""
+# NOTE(review): the bibtex fence in the citation section below is empty.
+article = r"""
+---
+📝 **Tips**
+<br>
+1. As the value of end_fusion <b>increases</b>, the style gradually diminishes.
+Therefore, it is suggested to set end_fusion to be between 1/5 and 1/3 of the number of inference steps (num inference steps).
+2. If you want to experience style-based CFG, see the details on the <a href="https://github.com/Westlake-AGI-Lab/StyleStudio">GitHub repo</a>.
+---
+📝 **Citation**
+<br>
+If our work is helpful for your research or applications, please cite us via:
+```bibtex
+```
+📧 **Contact**
+<br>
+If you have any questions, please feel free to open an issue or directly reach us out at <b>leimingkun@westlake.edu.cn</b>.
+"""
+# ---------------------------------------------------------------------------
+# Gradio UI: layout, event wiring and launch.
+# ---------------------------------------------------------------------------
+# The request queue holds at most 10 pending jobs; api_open=False is passed
+# to queue() and the CSS hides the page footer.
+block = gr.Blocks(css="footer {visibility: hidden}").queue(max_size=10, api_open=False)
+with block:
+    gr.Markdown(title)
+    gr.Markdown(description)
+    with gr.Tabs():
+        with gr.Row():
+            # Left column: inputs and options.
+            with gr.Column():
+                with gr.Row():
+                    with gr.Column():
+                        style_image_pil = gr.Image(label="Style Image", type='pil')
+                prompt = gr.Textbox(label="Prompt",
+                                    value="A red apple")
+                neg_prompt = gr.Textbox(label="Negative Prompt",
+                                    value="text, watermark, lowres, low quality, worst quality, deformed, glitch, low contrast, noisy, saturation, blurry")
+                with gr.Accordion(open=True, label="Advanced Options"):
+                    guidance_scale = gr.Slider(minimum=1, maximum=15.0, step=0.01, value=7.0, label="guidance scale")
+                    num_inference_steps = gr.Slider(minimum=5, maximum=200.0, step=1.0, value=50,
+                                                    label="num inference steps")
+                    # NOTE(review): the end_fusion range (0-200) is fixed and not
+                    # tied to the chosen number of inference steps.
+                    end_fusion = gr.Slider(minimum=0, maximum=200, step=1.0, value=20.0, label="end fusion")
+                    seed = gr.Slider(minimum=-1000000, maximum=1000000, value=42, step=1, label="Seed Value")
+                    randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
+                    crossModalAdaIN = gr.Checkbox(label="Cross Modal AdaIN", value=True)
+                    use_SAttn = gr.Checkbox(label="Teacher Model", value=True)
+                generate_button = gr.Button("Generate Image")
+            # Right column: result gallery.
+            with gr.Column():
+                generated_image = gr.Gallery(label="Generated Image")
+        # Two-step click handler: first update the seed slider (optionally with
+        # a random value), then run generation with the updated seed.
+        generate_button.click(
+            fn=randomize_seed_fn,
+            inputs=[seed, randomize_seed],
+            outputs=seed,
+            queue=False,
+            api_name=False,
+        ).then(
+            fn=create_image,
+            # Order must match create_image's positional parameters.
+            inputs=[
+                    style_image_pil,
+                    prompt,
+                    neg_prompt,
+                    guidance_scale,
+                    num_inference_steps,
+                    end_fusion,
+                    crossModalAdaIN,
+                    use_SAttn,
+                    seed,],
+            outputs=[generated_image])
+    # Example rows; the input order must match run_for_examples' parameters
+    # and the column order produced by get_example().
+    gr.Examples(
+        examples=get_example(),
+        inputs=[style_image_pil, prompt, guidance_scale, seed, end_fusion],
+        fn=run_for_examples,
+        outputs=[generated_image],
+        cache_examples=False,
+    )
+    gr.Markdown(article)
+# Listens on all interfaces (0.0.0.0), port 1234.
+block.launch(server_name="0.0.0.0", server_port=1234)

assets/style3.jpg ADDED Viewed

ip_adapter/__pycache__/__init__.cpython-39.pyc CHANGED Viewed

Binary files a/ip_adapter/__pycache__/__init__.cpython-39.pyc and b/ip_adapter/__pycache__/__init__.cpython-39.pyc differ

ip_adapter/__pycache__/attention_processor.cpython-39.pyc CHANGED Viewed

Binary files a/ip_adapter/__pycache__/attention_processor.cpython-39.pyc and b/ip_adapter/__pycache__/attention_processor.cpython-39.pyc differ

ip_adapter/__pycache__/ip_adapter.cpython-39.pyc CHANGED Viewed

Binary files a/ip_adapter/__pycache__/ip_adapter.cpython-39.pyc and b/ip_adapter/__pycache__/ip_adapter.cpython-39.pyc differ

ip_adapter/__pycache__/resampler.cpython-39.pyc CHANGED Viewed

Binary files a/ip_adapter/__pycache__/resampler.cpython-39.pyc and b/ip_adapter/__pycache__/resampler.cpython-39.pyc differ

ip_adapter/__pycache__/utils.cpython-39.pyc CHANGED Viewed

Binary files a/ip_adapter/__pycache__/utils.cpython-39.pyc and b/ip_adapter/__pycache__/utils.cpython-39.pyc differ

ip_adapter/attention_processor.py CHANGED Viewed

@@ -838,6 +838,8 @@ class AttnProcessor2_0_hijack(torch.nn.Module):
         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
         if self.fuSAttn and self.denoise_step <= self.end_fusion:
             assert query.shape[0] == 4
             scale_factor = 1 / math.sqrt(torch.tensor(head_dim, dtype=query.dtype))
             attn_probs = (torch.matmul(query, key.transpose(-2, -1)) * scale_factor).softmax(dim=-1)

         # the output of sdp = (batch, num_heads, seq_len, head_dim)
         # TODO: add support for attn.scale when we move to Torch 2.1
         if self.fuSAttn and self.denoise_step <= self.end_fusion:
+            if self.end_fusion == 0:
+                print("yes")
             assert query.shape[0] == 4
             scale_factor = 1 / math.sqrt(torch.tensor(head_dim, dtype=query.dtype))
             attn_probs = (torch.matmul(query, key.transpose(-2, -1)) * scale_factor).softmax(dim=-1)

ip_adapter/ip_adapter.py CHANGED Viewed

@@ -1121,6 +1121,7 @@ class StyleStudio_Adapter(CSGO):
         for attn_processor in self.pipe.unet.attn_processors.values():
             if isinstance(attn_processor, AttnProcessor_hijack) or isinstance(attn_processor, IPAttnProcessor_cross_modal):
                 attn_processor.num_inference_step = num_T
     def set_adain(self, use_CMA):
         for attn_processor in self.pipe.unet.attn_processors.values():
@@ -1143,7 +1144,7 @@ class StyleStudio_Adapter(CSGO):
             use_SAttn=True,
             **kwargs,
     ):
         self.set_endFusion(end_T = end_fusion)
         self.set_adain(use_CMA=cross_modal_adain)
         self.set_SAttn(use_SAttn=use_SAttn)

         for attn_processor in self.pipe.unet.attn_processors.values():
             if isinstance(attn_processor, AttnProcessor_hijack) or isinstance(attn_processor, IPAttnProcessor_cross_modal):
                 attn_processor.num_inference_step = num_T
+                attn_processor.denoise_step = 0
     def set_adain(self, use_CMA):
         for attn_processor in self.pipe.unet.attn_processors.values():
             use_SAttn=True,
             **kwargs,
     ):
+        print(end_fusion)
         self.set_endFusion(end_T = end_fusion)
         self.set_adain(use_CMA=cross_modal_adain)
         self.set_SAttn(use_SAttn=use_SAttn)