sd-diffusers-webui

Runtime error

App Files Files Community

Lightxr

nyanko7 committed on Feb 26, 2023

Commit

6fd31c7

•

0 Parent(s):

Duplicate from nyanko7/sd-diffusers-webui

Browse files

Co-authored-by: Nyanko <nyanko7@users.noreply.huggingface.co>

Files changed (8) hide show

.gitattributes +34 -0
Dockerfile +22 -0
README.md +14 -0
app.py +878 -0
modules/lora.py +183 -0
modules/model.py +897 -0
modules/prompt_parser.py +391 -0
modules/safe.py +188 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,22 @@

+# Dockerfile Public T4
+FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
+# key=value form; the space-separated `ENV key value` form is legacy syntax.
+ENV DEBIAN_FRONTEND=noninteractive
+WORKDIR /content
+RUN apt-get update -y && apt-get upgrade -y && apt-get install -y libgl1 libglib2.0-0 wget git git-lfs python3-pip python-is-python3 && pip3 install --upgrade pip
+RUN pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchsde --extra-index-url https://download.pytorch.org/whl/cu113
+RUN pip install https://github.com/camenduru/stable-diffusion-webui-colab/releases/download/0.0.16/xformers-0.0.16+814314d.d20230118-cp310-cp310-linux_x86_64.whl
+RUN pip install --pre triton
+RUN pip install numexpr einops transformers k_diffusion safetensors gradio diffusers==0.12.1
+# COPY is sufficient for plain local files; ADD's archive/URL handling is not needed here.
+COPY . .
+RUN adduser --disabled-password --gecos '' user
+RUN chown -R user:user /content
+RUN chmod -R 777 /content
+USER user
+EXPOSE 7860
+# Exec form starts python directly instead of through `/bin/sh -c`, so the
+# app process itself receives the container's stop signals.
+CMD ["python", "/content/app.py"]

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: Sd Diffusers Webui
+emoji: 🐳
+colorFrom: purple
+colorTo: gray
+sdk: docker
+sdk_version: 3.9
+pinned: false
+license: openrail
+app_port: 7860
+duplicated_from: nyanko7/sd-diffusers-webui
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,878 @@

+import random
+import tempfile
+import time
+import gradio as gr
+import numpy as np
+import torch
+import math
+import re
+from gradio import inputs
+from diffusers import (
+    AutoencoderKL,
+    DDIMScheduler,
+    UNet2DConditionModel,
+)
+from modules.model import (
+    CrossAttnProcessor,
+    StableDiffusionPipeline,
+)
+from torchvision import transforms
+from transformers import CLIPTokenizer, CLIPTextModel
+from PIL import Image
+from pathlib import Path
+from safetensors.torch import load_file
+import modules.safe as _
+from modules.lora import LoRANetwork
+# Selectable checkpoints as (display name, model id passed to from_pretrained, clip skip).
+models = [
+    ("AbyssOrangeMix2", "Korakoe/AbyssOrangeMix2-HF", 2),
+    ("Pastal Mix", "andite/pastel-mix", 2),
+    ("Basil Mix", "nuigurumi/basil_mix", 2)
+]
+# Model ids listed here are not moved back to the CPU when another model is selected.
+keep_vram = ["Korakoe/AbyssOrangeMix2-HF", "andite/pastel-mix"]
+# The first entry is the model loaded at startup.
+base_name, base_model, clip_skip = models[0]
+# Sampler choices as (UI label, sampler function name, extra options). inference()
+# forwards the function name and options to the pipeline as sampler_name / sampler_opt.
+samplers_k_diffusion = [
+    ("Euler a", "sample_euler_ancestral", {}),
+    ("Euler", "sample_euler", {}),
+    ("LMS", "sample_lms", {}),
+    ("Heun", "sample_heun", {}),
+    ("DPM2", "sample_dpm_2", {"discard_next_to_last_sigma": True}),
+    ("DPM2 a", "sample_dpm_2_ancestral", {"discard_next_to_last_sigma": True}),
+    ("DPM++ 2S a", "sample_dpmpp_2s_ancestral", {}),
+    ("DPM++ 2M", "sample_dpmpp_2m", {}),
+    ("DPM++ SDE", "sample_dpmpp_sde", {}),
+    ("LMS Karras", "sample_lms", {"scheduler": "karras"}),
+    ("DPM2 Karras", "sample_dpm_2", {"scheduler": "karras", "discard_next_to_last_sigma": True}),
+    ("DPM2 a Karras", "sample_dpm_2_ancestral", {"scheduler": "karras", "discard_next_to_last_sigma": True}),
+    ("DPM++ 2S a Karras", "sample_dpmpp_2s_ancestral", {"scheduler": "karras"}),
+    ("DPM++ 2M Karras", "sample_dpmpp_2m", {"scheduler": "karras"}),
+    ("DPM++ SDE Karras", "sample_dpmpp_sde", {"scheduler": "karras"}),
+]
+# samplers_diffusers = [
+#     ("DDIMScheduler", "diffusers.schedulers.DDIMScheduler", {})
+#     ("DDPMScheduler", "diffusers.schedulers.DDPMScheduler", {})
+#     ("DEISMultistepScheduler", "diffusers.schedulers.DEISMultistepScheduler", {})
+# ]
+# Wall-clock reference for the "Space built in ..." message printed before launch.
+start_time = time.time()
+# Passed to the pipeline as "timeout" together with each request's start time.
+timeout = 90
+# Load the components of the startup model; weights are requested in float16.
+scheduler = DDIMScheduler.from_pretrained(
+    base_model,
+    subfolder="scheduler",
+)
+vae = AutoencoderKL.from_pretrained(
+    "stabilityai/sd-vae-ft-ema",
+    torch_dtype=torch.float16
+)
+text_encoder = CLIPTextModel.from_pretrained(
+    base_model,
+    subfolder="text_encoder",
+    torch_dtype=torch.float16,
+)
+tokenizer = CLIPTokenizer.from_pretrained(
+    base_model,
+    subfolder="tokenizer",
+    torch_dtype=torch.float16,
+)
+unet = UNet2DConditionModel.from_pretrained(
+    base_model,
+    subfolder="unet",
+    torch_dtype=torch.float16,
+)
+pipe = StableDiffusionPipeline(
+    text_encoder=text_encoder,
+    tokenizer=tokenizer,
+    unet=unet,
+    vae=vae,
+    scheduler=scheduler,
+)
+# Install a processor *instance*, as setup_model() does; the class object
+# itself is not a usable attention processor.
+unet.set_attn_processor(CrossAttnProcessor())
+pipe.setup_text_encoder(clip_skip, text_encoder)
+if torch.cuda.is_available():
+    pipe = pipe.to("cuda")
+def get_model_list():
+    """Return the module-level list of selectable models (the same list object)."""
+    available = models
+    return available
+# Per-model caches keyed by model id, seeded with the startup model's components.
+te_cache = {
+    base_model: text_encoder
+}
+unet_cache = {
+    base_model: unet
+}
+lora_cache = {
+    base_model: LoRANetwork(text_encoder, unet)
+}
+# Number of rows in the startup text encoder's input-embedding weight.
+te_base_weight_length = text_encoder.get_input_embeddings().weight.data.shape[0]
+# Unpatched tokenizer method; setup_model() reassigns it to the tokenizer on every
+# call and the wrapper defined in setup_tokenizer() delegates to it.
+original_prepare_for_tokenization = tokenizer.prepare_for_tokenization
+# Model id currently selected; updated by setup_model().
+current_model = base_model
+def setup_model(name, lora_state=None, lora_scale=1.0):
+    """Bind the model selected by display name to the shared pipeline.
+
+    Args:
+        name: display name, i.e. the first field of an entry in ``models``.
+            An unknown name makes ``keys.index`` raise ``ValueError``.
+        lora_state: value handed to the model's ``LoRANetwork.load``; ``None``
+            or ``""`` means no LoRA is loaded.
+        lora_scale: second argument handed to ``LoRANetwork.load``.
+
+    Returns:
+        The module-level ``pipe`` with its text encoder and UNet swapped in.
+    """
+    global pipe, current_model
+    keys = [k[0] for k in models]
+    model = models[keys.index(name)][1]
+    # Load and cache the UNet / text encoder / LoRA wrapper on first use of a model.
+    if model not in unet_cache:
+        unet = UNet2DConditionModel.from_pretrained(model, subfolder="unet", torch_dtype=torch.float16)
+        text_encoder = CLIPTextModel.from_pretrained(model, subfolder="text_encoder", torch_dtype=torch.float16)
+        unet_cache[model] = unet
+        te_cache[model] = text_encoder
+        lora_cache[model] = LoRANetwork(text_encoder, unet)
+    if current_model != model:
+        if current_model not in keep_vram:
+            # offload current model
+            unet_cache[current_model].to("cpu")
+            te_cache[current_model].to("cpu")
+            lora_cache[current_model].to("cpu")
+        current_model = model
+    local_te, local_unet, local_lora, = te_cache[model], unet_cache[model], lora_cache[model]
+    # Start every request from a fresh attention processor and a reset LoRA network.
+    local_unet.set_attn_processor(CrossAttnProcessor())
+    local_lora.reset()
+    clip_skip = models[keys.index(name)][2]
+    if torch.cuda.is_available():
+        local_unet.to("cuda")
+        local_te.to("cuda")
+    if lora_state is not None and lora_state != "":
+        local_lora.load(lora_state, lora_scale)
+        # Keep the LoRA weights on the same device and dtype as the UNet.
+        local_lora.to(local_unet.device, dtype=local_unet.dtype)
+    pipe.text_encoder, pipe.unet = local_te, local_unet
+    pipe.setup_unet(local_unet)
+    # Restore the unpatched tokenizer method and forget tokens added by earlier requests.
+    pipe.tokenizer.prepare_for_tokenization = original_prepare_for_tokenization
+    pipe.tokenizer.added_tokens_encoder = {}
+    pipe.tokenizer.added_tokens_decoder = {}
+    pipe.setup_text_encoder(clip_skip, local_te)
+    return pipe
+def error_str(error, title="Error"):
+    """Format ``error`` under a markdown heading, or return "" when it is falsy."""
+    if not error:
+        return ""
+    # The second line keeps the twelve leading spaces of the original template.
+    return f"#### {title}\n            {error}"
+def make_token_names(embs):
+    """Return, per embedding, the placeholder tokens ``emb-<name>-<i>``.
+
+    One token is generated for each of the ``len(vec)`` entries of an
+    embedding; the outer list follows the iteration order of ``embs``.
+    """
+    return [
+        [f'emb-{label}-{index}' for index in range(len(vectors))]
+        for label, vectors in embs.items()
+    ]
+def setup_tokenizer(tokenizer, embs):
+    """Patch ``tokenizer`` so embedding names in a prompt expand to placeholder tokens.
+
+    Each key of ``embs`` that appears as a whole comma/whitespace-delimited
+    word in the prompt is replaced by the space-joined ``emb-<name>-<i>``
+    tokens produced by ``make_token_names`` before normal tokenization.
+
+    Args:
+        tokenizer: tokenizer whose ``prepare_for_tokenization`` is replaced.
+        embs: mapping of embedding name to its vectors.
+
+    Returns:
+        Flat list of all placeholder tokens, for registration with the tokenizer.
+    """
+    token_groups = make_token_names(embs)
+    # Names come from uploaded file names, so escape them before building a regex.
+    reg_match = [re.compile(fr"(?:^|(?<=\s|,)){re.escape(k)}(?=,|\s|$)") for k in embs]
+    clip_keywords = [' '.join(s) for s in token_groups]
+    def parse_prompt(prompt: str):
+        for m, v in zip(reg_match, clip_keywords):
+            prompt = m.sub(v, prompt)
+        return prompt
+    def prepare_for_tokenization(self, text: str, is_split_into_words: bool = False, **kwargs):
+        text = parse_prompt(text)
+        return original_prepare_for_tokenization(text, is_split_into_words, **kwargs)
+    # This assignment must live in setup_tokenizer's own body: placed after the
+    # nested function's return it is unreachable and the patch is never installed.
+    tokenizer.prepare_for_tokenization = prepare_for_tokenization.__get__(tokenizer, CLIPTokenizer)
+    return [t for sublist in token_groups for t in sublist]
+def convert_size(size_bytes):
+    """Format a byte count with a binary unit, e.g. ``1536`` -> ``"1.5 KB"``.
+
+    The value is divided by the largest power of 1024 not exceeding it and
+    rounded to two decimals. Zero is reported as ``"0B"``. Negative input
+    raises ``ValueError`` (from ``math.log``).
+    """
+    if size_bytes == 0:
+        return "0B"
+    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
+    i = int(math.floor(math.log(size_bytes, 1024)))
+    # Clamp to the available units: values below 1 would otherwise index from
+    # the end of the tuple and values >= 1024**9 would run past it.
+    i = max(0, min(i, len(size_name) - 1))
+    p = math.pow(1024, i)
+    s = round(size_bytes / p, 2)
+    return f"{s} {size_name[i]}"
+def inference(
+    prompt,
+    guidance,
+    steps,
+    width=512,
+    height=512,
+    seed=0,
+    neg_prompt="",
+    state=None,
+    g_strength=0.4,
+    img_input=None,
+    i2i_scale=0.5,
+    hr_enabled=False,
+    hr_method="Latent",
+    hr_scale=1.5,
+    hr_denoise=0.8,
+    sampler="DPM++ 2M Karras",
+    embs=None,
+    model=None,
+    lora_state=None,
+    lora_scale=None,
+):
+    """Generate one image and return it as a ``gr.Image`` update.
+
+    The parameter order matches the ``inputs`` list wired to the Generate
+    button. Depending on the arguments this runs image-to-image (when
+    ``img_input`` is given), text-to-image with latent upscaling (when
+    ``hr_enabled``), or plain text-to-image.
+
+    Args:
+        seed: ``None`` or ``0`` picks a random seed.
+        state: paint-with-words state, forwarded as ``pww_state``.
+        sampler: UI label looked up in ``samplers_k_diffusion``.
+        embs: mapping of embedding name to file path; ``.pt`` files go through
+            ``torch.load``, everything else through safetensors ``load_file``.
+        model: display name handed to ``setup_model``.
+        lora_state, lora_scale: forwarded to ``setup_model``; a ``None`` scale
+            falls back to 1.0, the default of ``setup_model``.
+
+    Returns:
+        ``gr.Image.update`` carrying the first result image, labelled with the seed.
+    """
+    if seed is None or seed == 0:
+        seed = random.randint(0, 2147483647)
+    if lora_scale is None:
+        lora_scale = 1.0
+    pipe = setup_model(model, lora_state, lora_scale)
+    # Same guard as the rest of the module: only touch CUDA when it is available.
+    use_cuda = torch.cuda.is_available()
+    generator = torch.Generator("cuda" if use_cuda else "cpu").manual_seed(int(seed))
+    start_time = time.time()
+    sampler_name, sampler_opt = None, None
+    for label, funcname, options in samplers_k_diffusion:
+        if label == sampler:
+            sampler_name, sampler_opt = funcname, options
+            break
+    tokenizer, text_encoder = pipe.tokenizer, pipe.text_encoder
+    if embs is not None and len(embs) > 0:
+        ti_embs = {}
+        for name, file in embs.items():
+            if str(file).endswith(".pt"):
+                # NOTE(review): torch.load unpickles user-uploaded data; presumably
+                # `modules.safe` (imported above for its side effects) restricts it - verify.
+                loaded_learned_embeds = torch.load(file, map_location="cpu")
+            else:
+                loaded_learned_embeds = load_file(file, device="cpu")
+            # Some files wrap the vectors as {"string_to_param": {"*": tensor}}.
+            if "string_to_param" in loaded_learned_embeds:
+                loaded_learned_embeds = loaded_learned_embeds["string_to_param"]["*"]
+            ti_embs[name] = loaded_learned_embeds
+        if len(ti_embs) > 0:
+            tokens = setup_tokenizer(tokenizer, ti_embs)
+            added_tokens = tokenizer.add_tokens(tokens)
+            delta_weight = torch.cat([val for val in ti_embs.values()], dim=0)
+            assert added_tokens == delta_weight.shape[0]
+            text_encoder.resize_token_embeddings(len(tokenizer))
+            token_embeds = text_encoder.get_input_embeddings().weight.data
+            # The newly added tokens occupy the last rows of the embedding matrix.
+            token_embeds[-delta_weight.shape[0]:] = delta_weight
+    config = {
+        "negative_prompt": neg_prompt,
+        "num_inference_steps": int(steps),
+        "guidance_scale": guidance,
+        "generator": generator,
+        "sampler_name": sampler_name,
+        "sampler_opt": sampler_opt,
+        "pww_state": state,
+        "pww_attn_weight": g_strength,
+        "start_time": start_time,
+        "timeout": timeout,
+    }
+    if img_input is not None:
+        # Scale the input to fit inside width x height while keeping its aspect ratio.
+        ratio = min(height / img_input.height, width / img_input.width)
+        img_input = img_input.resize(
+            (int(img_input.width * ratio), int(img_input.height * ratio)), Image.LANCZOS
+        )
+        result = pipe.img2img(prompt, image=img_input, strength=i2i_scale, **config)
+    elif hr_enabled:
+        result = pipe.txt2img(
+            prompt,
+            width=width,
+            height=height,
+            upscale=True,
+            upscale_x=hr_scale,
+            upscale_denoising_strength=hr_denoise,
+            **config,
+            **latent_upscale_modes[hr_method],
+        )
+    else:
+        result = pipe.txt2img(prompt, width=width, height=height, **config)
+    end_time = time.time()
+    if use_cuda:
+        vram_free, vram_total = torch.cuda.mem_get_info()
+        vram_usage = f"{convert_size(vram_total-vram_free)}/{convert_size(vram_total)}"
+    else:
+        vram_usage = "n/a"
+    print(f"done: model={model}, res={width}x{height}, step={steps}, time={round(end_time-start_time, 2)}s, vram_alloc={vram_usage}")
+    return gr.Image.update(result[0][0], label=f"Initial Seed: {seed}")
+# Shared palette; it only ever grows, so a region keeps its colour across calls.
+color_list = []
+def get_color(n):
+    """Return the shared palette, extended with random RGB tuples to at least ``n`` entries."""
+    while len(color_list) < n:
+        rgb = np.random.random(size=3) * 256
+        color_list.append(tuple(rgb))
+    return color_list
+def create_mixed_img(current, state, w=512, h=512):
+    """Render all region maps of ``state`` onto one ``h x w x 4`` preview array.
+
+    The canvas starts filled with 255. For every entry whose ``"map"`` is set,
+    the pixels where the map is below 255 receive that entry's palette colour,
+    with alpha 200 for the entry named ``current`` and 150 for the others.
+    A ``None`` state is treated as empty.
+    """
+    width_px, height_px = int(w), int(h)
+    canvas = np.full([height_px, width_px, 4], 255)
+    regions = {} if state is None else state
+    palette = get_color(len(regions))
+    # The palette slot is the entry's position, counted even when its map is unset.
+    for position, (token, entry) in enumerate(regions.items()):
+        region_map = entry["map"]
+        if region_map is None:
+            continue
+        opacity = 200 if current == token else 150
+        canvas[region_map < 255] = palette[position] + (opacity,)
+    return canvas
+# width.change(apply_new_res, inputs=[width, height, global_stats], outputs=[global_stats, sp, rendered])
+def apply_new_res(w, h, state):
+    """Resize every stored region map to ``w x h`` (in place) and redraw the preview."""
+    new_w, new_h = int(w), int(h)
+    for entry in state.values():
+        if entry["map"] is None:
+            continue
+        entry["map"] = resize(entry["map"], new_w, new_h)
+    preview = create_mixed_img("", state, new_w, new_h)
+    return state, gr.Image.update(value=preview)
+def detect_text(text, state, width, height):
+    """Rebuild the region state from the comma-separated token text.
+
+    Tokens already present in ``state`` keep their map, weight and mask
+    option; new tokens start with no map, weight 0.5 and masking off. Blank
+    items are ignored.
+
+    Returns:
+        ``(new_state, sketch_update, radio_update, preview_update)``; for empty
+        text the result is ``(None, None, <radio cleared>, None)``.
+    """
+    if text is None or text == "":
+        return None, None, gr.Radio.update(value=None), None
+    previous = state if state is not None else {}
+    new_state = {}
+    for raw in text.split(","):
+        token = raw.strip()
+        if token == "":
+            continue
+        if token in previous:
+            kept = previous[token]
+            new_state[token] = {
+                "map": kept["map"],
+                "weight": kept["weight"],
+                "mask_outsides": kept["mask_outsides"],
+            }
+        else:
+            new_state[token] = {"map": None, "weight": 0.5, "mask_outsides": False}
+    radio_update = gr.Radio.update(choices=list(new_state), value=None)
+    preview_update = gr.update(value=create_mixed_img("", new_state, width, height))
+    sketch_update = gr.update(value=None, interactive=False)
+    return new_state, sketch_update, radio_update, preview_update
+def resize(img, w, h):
+    """Resize ``img`` so its shorter side is ``min(h, w)``, centre-crop to ``(h, w)``, return uint8 array."""
+    steps = [
+        transforms.ToPILImage(),
+        transforms.Resize(min(h, w)),
+        transforms.CenterCrop((h, w)),
+    ]
+    pipeline = transforms.Compose(steps)
+    return np.array(pipeline(img), dtype=np.uint8)
+def switch_canvas(entry, state, width, height):
+    """Prepare the sketch pad and option widgets for the selected token.
+
+    Returns:
+        ``(sketch_update, weight_update, mask_update, preview)``. With no
+        selection the widgets get ``None`` / 0.5 / ``False``; otherwise the
+        sketch pad is cleared and made interactive and the widgets show the
+        token's stored weight and mask option (defaults when it is unknown).
+    """
+    if entry is None:
+        return None, 0.5, False, create_mixed_img("", state, width, height)
+    # detect_text stores None as the state for empty text; treat that as "no entries"
+    # instead of failing on `entry in None`.
+    settings = (state or {}).get(entry, {})
+    return (
+        gr.update(value=None, interactive=True),
+        gr.update(value=settings.get("weight", 0.5)),
+        gr.update(value=settings.get("mask_outsides", False)),
+        create_mixed_img(entry, state, width, height),
+    )
+def apply_canvas(selected, draw, state, w, h):
+    """Store the drawn sketch as the selected token's map and redraw the preview."""
+    if selected not in state:
+        # Unknown token: nothing to store, just redraw with the sizes as given.
+        return state, gr.Image.update(value=create_mixed_img(selected, state, w, h))
+    width_px, height_px = int(w), int(h)
+    state[selected]["map"] = resize(draw, width_px, height_px)
+    preview = create_mixed_img(selected, state, width_px, height_px)
+    return state, gr.Image.update(value=preview)
+def apply_weight(selected, weight, state):
+    """Set the selected token's weight in ``state`` (in place) and return ``state``."""
+    if selected not in state:
+        return state
+    state[selected]["weight"] = weight
+    return state
+def apply_option(selected, mask, state):
+    """Set the selected token's ``mask_outsides`` flag in ``state`` (in place) and return ``state``."""
+    if selected not in state:
+        return state
+    state[selected]["mask_outsides"] = mask
+    return state
+# sp2, radio, width, height, global_stats
+def apply_image(image, selected, w, h, strgength, mask, state):
+    """Use an uploaded image as the selected token's map and redraw the preview.
+
+    Nothing is stored when no image was uploaded or the token is not part of
+    ``state``; the preview is redrawn either way.
+
+    Returns:
+        ``(state, preview_update)``.
+    """
+    # Cast like the other canvas handlers do before handing sizes to resize().
+    w, h = int(w), int(h)
+    if image is not None and state and selected in state:
+        state[selected] = {
+            "map": resize(image, w, h),
+            "weight": strgength,
+            "mask_outsides": mask
+        }
+    return state, gr.Image.update(value=create_mixed_img(selected, state, w, h))
+# [ti_state, lora_state, ti_vals, lora_vals, uploads]
+def add_net(files, ti_state, lora_state):
+    """Register uploaded files as a LoRA or as textual-inversion embeddings.
+
+    A file whose state dict has any key containing ``"lora"`` replaces
+    ``lora_state`` with its path; every other file is stored in ``ti_state``
+    under its stripped file stem.
+
+    Returns:
+        Always the five values wired as outputs: ``(ti_state, lora_state,
+        embeddings text update, lora text update, cleared upload widget)``.
+    """
+    # With nothing uploaded, fall through so the same five outputs are returned
+    # in the same order as on the normal path.
+    for file in files or []:
+        item = Path(file.name)
+        stripedname = str(item.stem).strip()
+        if item.suffix == ".pt":
+            # NOTE(review): torch.load unpickles user-uploaded data; presumably
+            # `modules.safe` (imported above for its side effects) restricts it - verify.
+            state_dict = torch.load(file.name, map_location="cpu")
+        else:
+            state_dict = load_file(file.name, device="cpu")
+        if any("lora" in k for k in state_dict.keys()):
+            lora_state = file.name
+        else:
+            ti_state[stripedname] = file.name
+    return (
+        ti_state,
+        lora_state,
+        gr.Text.update(f"{[key for key in ti_state.keys()]}"),
+        gr.Text.update(f"{lora_state}"),
+        gr.Files.update(value=None),
+    )
+# [ti_state, lora_state, ti_vals, lora_vals, uploads]
+def clean_states(ti_state, lora_state):
+    """Drop all loaded embeddings and the LoRA; both arguments are ignored.
+
+    Returns a fresh empty dict, ``None`` for the LoRA, two emptied text
+    boxes and a cleared upload widget.
+    """
+    fresh_embeddings = {}
+    embeddings_text = gr.Text.update("")
+    lora_text = gr.Text.update("")
+    cleared_upload = gr.File.update(value=None)
+    return (fresh_embeddings, None, embeddings_text, lora_text, cleared_upload)
+# Hires-fix upscale choices: UI label -> keyword arguments that inference()
+# unpacks into pipe.txt2img (upscale_method / upscale_antialias).
+latent_upscale_modes = {
+    "Latent": {"upscale_method": "bilinear", "upscale_antialias": False},
+    "Latent (antialiased)": {"upscale_method": "bilinear", "upscale_antialias": True},
+    "Latent (bicubic)": {"upscale_method": "bicubic", "upscale_antialias": False},
+    "Latent (bicubic antialiased)": {
+        "upscale_method": "bicubic",
+        "upscale_antialias": True,
+    },
+    "Latent (nearest)": {"upscale_method": "nearest", "upscale_antialias": False},
+    "Latent (nearest-exact)": {
+        "upscale_method": "nearest-exact",
+        "upscale_antialias": False,
+    },
+}
+css = """
+.finetuned-diffusion-div div{
+    display:inline-flex;
+    align-items:center;
+    gap:.8rem;
+    font-size:1.75rem;
+    padding-top:2rem;
+}
+.finetuned-diffusion-div div h1{
+    font-weight:900;
+    margin-bottom:7px
+}
+.finetuned-diffusion-div p{
+    margin-bottom:10px;
+    font-size:94%
+}
+.box {
+  float: left;
+  height: 20px;
+  width: 20px;
+  margin-bottom: 15px;
+  border: 1px solid black;
+  clear: both;
+}
+a{
+    text-decoration:underline
+}
+.tabs{
+    margin-top:0;
+    margin-bottom:0
+}
+#gallery{
+    min-height:20rem
+}
+.no-border {
+    border: none !important;
+}
+ """
+with gr.Blocks(css=css) as demo:
+    gr.HTML(
+        f"""
+            <div class="finetuned-diffusion-div">
+              <div>
+                <h1>Demo for diffusion models</h1>
+              </div>
+              <p>Hso @ nyanko.sketch2img.gradio</p>
+            </div>
+        """
+    )
+    global_stats = gr.State(value={})
+    with gr.Row():
+        with gr.Column(scale=55):
+            model = gr.Dropdown(
+                choices=[k[0] for k in get_model_list()],
+                label="Model",
+                value=base_name,
+            )
+            image_out = gr.Image(height=512)
+        # gallery = gr.Gallery(
+        #     label="Generated images", show_label=False, elem_id="gallery"
+        # ).style(grid=[1], height="auto")
+        with gr.Column(scale=45):
+            with gr.Group():
+                with gr.Row():
+                    with gr.Column(scale=70):
+                        prompt = gr.Textbox(
+                            label="Prompt",
+                            value="loli cat girl, blue eyes, flat chest, solo, long messy silver hair, blue capelet, cat ears, cat tail, upper body",
+                            show_label=True,
+                            max_lines=4,
+                            placeholder="Enter prompt.",
+                        )
+                        neg_prompt = gr.Textbox(
+                            label="Negative Prompt",
+                            value="bad quality, low quality, jpeg artifact, cropped",
+                            show_label=True,
+                            max_lines=4,
+                            placeholder="Enter negative prompt.",
+                        )
+                    generate = gr.Button(value="Generate").style(
+                        rounded=(False, True, True, False)
+                    )
+            with gr.Tab("Options"):
+                with gr.Group():
+                    # n_images = gr.Slider(label="Images", value=1, minimum=1, maximum=4, step=1)
+                    with gr.Row():
+                        guidance = gr.Slider(
+                            label="Guidance scale", value=7.5, maximum=15
+                        )
+                        steps = gr.Slider(
+                            label="Steps", value=25, minimum=2, maximum=50, step=1
+                        )
+                    with gr.Row():
+                        width = gr.Slider(
+                            label="Width", value=512, minimum=64, maximum=768, step=64
+                        )
+                        height = gr.Slider(
+                            label="Height", value=512, minimum=64, maximum=768, step=64
+                        )
+                    sampler = gr.Dropdown(
+                        value="DPM++ 2M Karras",
+                        label="Sampler",
+                        choices=[s[0] for s in samplers_k_diffusion],
+                    )
+                    seed = gr.Number(label="Seed (0 = random)", value=0)
+            with gr.Tab("Image to image"):
+                with gr.Group():
+                    inf_image = gr.Image(
+                        label="Image", height=256, tool="editor", type="pil"
+                    )
+                    inf_strength = gr.Slider(
+                        label="Transformation strength",
+                        minimum=0,
+                        maximum=1,
+                        step=0.01,
+                        value=0.5,
+                    )
+            def res_cap(g, w, h, x):
+                """Build the upscaler checkbox label; shows source and target size when enabled."""
+                if not g:
+                    return "Enable upscaler"
+                target_w, target_h = int(w*x), int(h*x)
+                return f"Enable upscaler: {w}x{h} to {target_w}x{target_h}"
+            with gr.Tab("Hires fix"):
+                with gr.Group():
+                    hr_enabled = gr.Checkbox(label="Enable upscaler", value=False)
+                    hr_method = gr.Dropdown(
+                        [key for key in latent_upscale_modes.keys()],
+                        value="Latent",
+                        label="Upscale method",
+                    )
+                    hr_scale = gr.Slider(
+                        label="Upscale factor",
+                        minimum=1.0,
+                        maximum=1.5,
+                        step=0.1,
+                        value=1.2,
+                    )
+                    hr_denoise = gr.Slider(
+                        label="Denoising strength",
+                        minimum=0.0,
+                        maximum=1.0,
+                        step=0.1,
+                        value=0.8,
+                    )
+                    hr_scale.change(
+                        lambda g, x, w, h: gr.Checkbox.update(
+                            label=res_cap(g, w, h, x)
+                        ),
+                        inputs=[hr_enabled, hr_scale, width, height],
+                        outputs=hr_enabled,
+                        queue=False,
+                    )
+                    hr_enabled.change(
+                        lambda g, x, w, h: gr.Checkbox.update(
+                            label=res_cap(g, w, h, x)
+                        ),
+                        inputs=[hr_enabled, hr_scale, width, height],
+                        outputs=hr_enabled,
+                        queue=False,
+                    )
+            with gr.Tab("Embeddings/Loras"):
+                ti_state = gr.State(dict())
+                lora_state = gr.State()
+                with gr.Group():
+                    with gr.Row():
+                        with gr.Column(scale=90):
+                            ti_vals = gr.Text(label="Loaded embeddings")
+                    with gr.Row():
+                        with gr.Column(scale=90):
+                            lora_vals = gr.Text(label="Loaded loras")
+                with gr.Row():
+                    uploads = gr.Files(label="Upload new embeddings/lora")
+                    with gr.Column():
+                        lora_scale = gr.Slider(
+                            label="Lora scale",
+                            minimum=0,
+                            maximum=2,
+                            step=0.01,
+                            value=1.0,
+                        )
+                        btn = gr.Button(value="Upload")
+                        btn_del = gr.Button(value="Reset")
+                btn.click(
+                    add_net,
+                    inputs=[uploads, ti_state, lora_state],
+                    outputs=[ti_state, lora_state, ti_vals, lora_vals, uploads],
+                    queue=False,
+                )
+                btn_del.click(
+                    clean_states,
+                    inputs=[ti_state, lora_state],
+                    outputs=[ti_state, lora_state, ti_vals, lora_vals, uploads],
+                    queue=False,
+                )
+        # error_output = gr.Markdown()
+    gr.HTML(
+        f"""
+            <div class="finetuned-diffusion-div">
+              <div>
+                <h1>Paint with words</h1>
+              </div>
+              <p>
+                Will use the following formula: w = scale * token_weight_martix * log(1 + sigma) * max(qk).
+              </p>
+            </div>
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=55):
+            rendered = gr.Image(
+                invert_colors=True,
+                source="canvas",
+                interactive=False,
+                image_mode="RGBA",
+            )
+        with gr.Column(scale=45):
+            with gr.Group():
+                with gr.Row():
+                    with gr.Column(scale=70):
+                        g_strength = gr.Slider(
+                            label="Weight scaling",
+                            minimum=0,
+                            maximum=0.8,
+                            step=0.01,
+                            value=0.4,
+                        )
+                        text = gr.Textbox(
+                            lines=2,
+                            interactive=True,
+                            label="Token to Draw: (Separate by comma)",
+                        )
+                        radio = gr.Radio([], label="Tokens")
+                    sk_update = gr.Button(value="Update").style(
+                        rounded=(False, True, True, False)
+                    )
+                # g_strength.change(lambda b: gr.update(f"Scaled additional attn: $w = {b} \log (1 + \sigma) \std (Q^T K)$."), inputs=g_strength, outputs=[g_output])
+            with gr.Tab("SketchPad"):
+                sp = gr.Image(
+                    image_mode="L",
+                    tool="sketch",
+                    source="canvas",
+                    interactive=False,
+                )
+                mask_outsides = gr.Checkbox(
+                    label="Mask other areas",
+                    value=False
+                )
+                strength = gr.Slider(
+                    label="Token strength",
+                    minimum=0,
+                    maximum=0.8,
+                    step=0.01,
+                    value=0.5,
+                )
+                sk_update.click(
+                    detect_text,
+                    inputs=[text, global_stats, width, height],
+                    outputs=[global_stats, sp, radio, rendered],
+                    queue=False,
+                )
+                radio.change(
+                    switch_canvas,
+                    inputs=[radio, global_stats, width, height],
+                    outputs=[sp, strength, mask_outsides, rendered],
+                    queue=False,
+                )
+                sp.edit(
+                    apply_canvas,
+                    inputs=[radio, sp, global_stats, width, height],
+                    outputs=[global_stats, rendered],
+                    queue=False,
+                )
+                strength.change(
+                    apply_weight,
+                    inputs=[radio, strength, global_stats],
+                    outputs=[global_stats],
+                    queue=False,
+                )
+                mask_outsides.change(
+                    apply_option,
+                    inputs=[radio, mask_outsides, global_stats],
+                    outputs=[global_stats],
+                    queue=False,
+                )
+            with gr.Tab("UploadFile"):
+                sp2 = gr.Image(
+                    image_mode="L",
+                    source="upload",
+                    shape=(512, 512),
+                )
+                mask_outsides2 = gr.Checkbox(
+                    label="Mask other areas",
+                    value=False,
+                )
+                strength2 = gr.Slider(
+                    label="Token strength",
+                    minimum=0,
+                    maximum=0.8,
+                    step=0.01,
+                    value=0.5,
+                )
+                apply_style = gr.Button(value="Apply")
+                apply_style.click(
+                    apply_image,
+                    inputs=[sp2, radio, width, height, strength2, mask_outsides2, global_stats],
+                    outputs=[global_stats, rendered],
+                    queue=False,
+                )
+            width.change(
+                apply_new_res,
+                inputs=[width, height, global_stats],
+                outputs=[global_stats, rendered],
+                queue=False,
+            )
+            height.change(
+                apply_new_res,
+                inputs=[width, height, global_stats],
+                outputs=[global_stats, rendered],
+                queue=False,
+            )
+    # color_stats = gr.State(value={})
+    # text.change(detect_color, inputs=[sp, text, color_stats], outputs=[color_stats, rendered])
+    # sp.change(detect_color, inputs=[sp, text, color_stats], outputs=[color_stats, rendered])
+    # Components are passed to inference() positionally, so this list must stay
+    # in the same order as inference()'s parameters.
+    # NOTE: this rebinds the module-level name `inputs` imported from gradio.
+    inputs = [
+        prompt,
+        guidance,
+        steps,
+        width,
+        height,
+        seed,
+        neg_prompt,
+        global_stats,
+        g_strength,
+        inf_image,
+        inf_strength,
+        hr_enabled,
+        hr_method,
+        hr_scale,
+        hr_denoise,
+        sampler,
+        ti_state,
+        model,
+        lora_state,
+        lora_scale,
+    ]
+    outputs = [image_out]
+    # Both pressing Enter in the prompt box and clicking Generate run inference.
+    prompt.submit(inference, inputs=inputs, outputs=outputs)
+    generate.click(inference, inputs=inputs, outputs=outputs)
+print(f"Space built in {time.time() - start_time:.2f} seconds")
+# demo.launch(share=True)
+demo.launch(enable_queue=True, server_name="0.0.0.0", server_port=7860)

modules/lora.py ADDED Viewed

	@@ -0,0 +1,183 @@

+# LoRA network module
+# reference:
+# https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
+# https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py
+# https://github.com/bmaltais/kohya_ss/blob/master/networks/lora.py#L48
+import math
+import os
+import torch
+import modules.safe as _
+from safetensors.torch import load_file
+class LoRAModule(torch.nn.Module):
+    """
+    Low-rank adapter for a single Linear or Conv2d module.
+    Replaces the forward method of the original module instead of replacing the module itself.
+    """
+    def __init__(
+            self,
+            lora_name,
+            org_module: torch.nn.Module,
+            multiplier=1.0,
+            lora_dim=4,
+            alpha=1,
+    ):
+        """
+        Args:
+            lora_name: name under which this adapter is registered.
+            org_module: the module whose forward will be wrapped by `apply()`.
+            multiplier: extra factor applied to the adapter output.
+            lora_dim: rank of the down/up projection pair.
+            alpha: scaling numerator; if alpha == 0 or None, alpha is rank (no scaling).
+        """
+        super().__init__()
+        self.lora_name = lora_name
+        self.lora_dim = lora_dim
+        if org_module.__class__.__name__ == "Conv2d":
+            in_dim = org_module.in_channels
+            out_dim = org_module.out_channels
+            self.lora_down = torch.nn.Conv2d(in_dim, lora_dim, (1, 1), bias=False)
+            self.lora_up = torch.nn.Conv2d(lora_dim, out_dim, (1, 1), bias=False)
+        else:
+            in_dim = org_module.in_features
+            out_dim = org_module.out_features
+            self.lora_down = torch.nn.Linear(in_dim, lora_dim, bias=False)
+            self.lora_up = torch.nn.Linear(lora_dim, out_dim, bias=False)
+        alpha = self._normalize_alpha(alpha, lora_dim)
+        self.scale = alpha / self.lora_dim
+        self.register_buffer("alpha", torch.tensor(alpha))  # a buffer, so it is treated as a constant
+        # same as microsoft's
+        torch.nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5))
+        torch.nn.init.zeros_(self.lora_up.weight)
+        self.multiplier = multiplier
+        self.org_module = org_module  # remove in applying
+        self.enable = False
+    @staticmethod
+    def _normalize_alpha(alpha, rank):
+        """Return alpha as a plain number, falling back to `rank` when alpha is None or 0."""
+        if isinstance(alpha, torch.Tensor):
+            # cast to float32 first: without casting, bf16 causes an error
+            alpha = alpha.detach().float().item()
+        return rank if alpha is None or alpha == 0 else alpha
+    def resize(self, rank, alpha, multiplier):
+        """
+        Re-create the down/up projections with a new rank and update alpha, scale and multiplier.
+        The new layers are freshly constructed, so previously held adapter weights are discarded.
+        `alpha` is normalised exactly as in `__init__` (tensor -> number, None/0 -> rank).
+        """
+        alpha = self._normalize_alpha(alpha, rank)
+        self.alpha = torch.tensor(alpha)
+        self.lora_dim = rank  # keep the recorded rank in sync with the new layers
+        self.multiplier = multiplier
+        self.scale = alpha / rank
+        if self.lora_down.__class__.__name__ == "Conv2d":
+            in_dim = self.lora_down.in_channels
+            out_dim = self.lora_up.out_channels
+            self.lora_down = torch.nn.Conv2d(in_dim, rank, (1, 1), bias=False)
+            self.lora_up = torch.nn.Conv2d(rank, out_dim, (1, 1), bias=False)
+        else:
+            in_dim = self.lora_down.in_features
+            out_dim = self.lora_up.out_features
+            self.lora_down = torch.nn.Linear(in_dim, rank, bias=False)
+            self.lora_up = torch.nn.Linear(rank, out_dim, bias=False)
+    def apply(self):
+        """Hook this adapter into the original module's forward; later calls are no-ops."""
+        if hasattr(self, "org_module"):
+            self.org_forward = self.org_module.forward
+            self.org_module.forward = self.forward
+            del self.org_module
+    def forward(self, x):
+        """Return the original output, plus the scaled low-rank update when `enable` is set."""
+        if self.enable:
+            return (
+                self.org_forward(x)
+                + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale
+            )
+        return self.org_forward(x)
+class LoRANetwork(torch.nn.Module):
+    """Set of LoRAModule adapters hooked into a text encoder and a U-Net."""
+    UNET_TARGET_REPLACE_MODULE = ["Transformer2DModel", "Attention"]
+    TEXT_ENCODER_TARGET_REPLACE_MODULE = ["CLIPAttention", "CLIPMLP"]
+    LORA_PREFIX_UNET = "lora_unet"
+    LORA_PREFIX_TEXT_ENCODER = "lora_te"
+    def __init__(self, text_encoder, unet, multiplier=1.0, lora_dim=4, alpha=1) -> None:
+        """
+        Create one adapter per Linear / 1x1 Conv2d found inside the target module classes.
+        Args:
+            text_encoder: a module to scan, or a list of already-built adapters to reuse as is.
+            unet: the U-Net module to scan.
+            multiplier, lora_dim, alpha: forwarded to every LoRAModule that is created.
+        """
+        super().__init__()
+        self.multiplier = multiplier
+        self.lora_dim = lora_dim
+        self.alpha = alpha
+        # create module instances
+        def create_modules(prefix, root_module: torch.nn.Module, target_replace_modules):
+            loras = []
+            for name, module in root_module.named_modules():
+                if module.__class__.__name__ in target_replace_modules:
+                    for child_name, child_module in module.named_modules():
+                        if child_module.__class__.__name__ == "Linear" or (child_module.__class__.__name__ == "Conv2d" and child_module.kernel_size == (1, 1)):
+                            lora_name = prefix + "." + name + "." + child_name
+                            lora_name = lora_name.replace(".", "_")
+                            lora = LoRAModule(lora_name, child_module, self.multiplier, self.lora_dim, self.alpha,)
+                            loras.append(lora)
+            return loras
+        if isinstance(text_encoder, list):
+            self.text_encoder_loras = text_encoder
+        else:
+            self.text_encoder_loras = create_modules(LoRANetwork.LORA_PREFIX_TEXT_ENCODER, text_encoder, LoRANetwork.TEXT_ENCODER_TARGET_REPLACE_MODULE)
+            print(f"Create LoRA for Text Encoder: {len(self.text_encoder_loras)} modules.")
+        self.unet_loras = create_modules(LoRANetwork.LORA_PREFIX_UNET, unet, LoRANetwork.UNET_TARGET_REPLACE_MODULE)
+        print(f"Create LoRA for U-Net: {len(self.unet_loras)} modules.")
+        self.weights_sd = None
+        # every adapter name must be unique because it doubles as the submodule name
+        names = set()
+        for lora in self.text_encoder_loras + self.unet_loras:
+            assert (lora.lora_name not in names), f"duplicated lora name: {lora.lora_name}"
+            names.add(lora.lora_name)
+            lora.apply()
+            self.add_module(lora.lora_name, lora)
+    def reset(self):
+        """Disable every adapter so the wrapped modules behave like the originals."""
+        for lora in self.text_encoder_loras + self.unet_loras:
+            lora.enable = False
+    def load(self, file, scale):
+        """
+        Load LoRA weights from `file`, resize every adapter to the stored rank and enable
+        the adapters of each part (text encoder / U-Net) that the file has keys for.
+        Args:
+            file: path to a ".safetensors" file or to a file readable by `torch.load`.
+            scale: multiplier given to every adapter.
+        Raises:
+            ValueError: if the rank cannot be inferred because no 2-D "lora_down" weight exists.
+        """
+        if os.path.splitext(file)[1] == ".safetensors":
+            weights = load_file(file)
+        else:
+            # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted
+            # sources unless the imported `modules.safe` restricts unpickling - confirm.
+            weights = torch.load(file, map_location="cpu")
+        if not weights:
+            return
+        network_alpha = None
+        network_dim = None
+        for key, value in weights.items():
+            if network_alpha is None and "alpha" in key:
+                network_alpha = value
+            if network_dim is None and "lora_down" in key and len(value.size()) == 2:
+                network_dim = value.size()[0]
+        if network_dim is None:
+            raise ValueError(f"cannot infer the LoRA rank from {file}: no 2-D lora_down weight found")
+        if isinstance(network_alpha, torch.Tensor):
+            # hand the adapters a plain number (cast first so bf16 values convert cleanly)
+            network_alpha = network_alpha.detach().float().item()
+        if network_alpha is None:
+            network_alpha = network_dim
+        weights_has_text_encoder = any(
+            key.startswith(LoRANetwork.LORA_PREFIX_TEXT_ENCODER) for key in weights
+        )
+        weights_has_unet = any(
+            key.startswith(LoRANetwork.LORA_PREFIX_UNET) for key in weights
+        )
+        # Every adapter is resized so that its shapes match the state dict; only the parts
+        # present in the file are switched on (nothing is switched off here, see reset()).
+        for loras, present in (
+            (self.text_encoder_loras, weights_has_text_encoder),
+            (self.unet_loras, weights_has_unet),
+        ):
+            for lora in loras:
+                lora.resize(network_dim, network_alpha, scale)
+                if present:
+                    lora.enable = True
+        info = self.load_state_dict(weights, False)
+        if len(info.unexpected_keys) > 0:
+            print(f"Weights are loaded. Unexpected keys={info.unexpected_keys}")

modules/model.py ADDED Viewed

	@@ -0,0 +1,897 @@

+import importlib
+import inspect
+import math
+from pathlib import Path
+import re
+from collections import defaultdict
+from typing import List, Optional, Union
+import time
+import k_diffusion
+import numpy as np
+import PIL
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
+from modules.prompt_parser import FrozenCLIPEmbedderWithCustomWords
+from torch import einsum
+from torch.autograd.function import Function
+from diffusers import DiffusionPipeline
+from diffusers.utils import PIL_INTERPOLATION, is_accelerate_available
+from diffusers.utils import logging, randn_tensor
+import modules.safe as _
+from safetensors.torch import load_file
+# Optional dependency: remember whether xformers could be imported.
+try:
+    import xformers
+except ImportError:
+    xformers_available = False
+else:
+    xformers_available = True
+EPSILON = 1e-6
+def exists(val):
+    """Return True when `val` is not None."""
+    return val is not None
+def default(val, d):
+    """Return `val` unless it is None, in which case return `d`."""
+    return val if exists(val) else d
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+def get_attention_scores(attn, query, key, attention_mask=None):
+    """
+    Return the pre-softmax attention scores `attn.scale * query @ key^T`.
+    `attention_mask`, when given, is added to the scores. `attn.upcast_attention` casts the
+    inputs to float32 first and `attn.upcast_softmax` casts the result to float32.
+    """
+    if attn.upcast_attention:
+        query, key = query.float(), key.float()
+    # With beta=0 the uninitialised placeholder contributes nothing to the result.
+    placeholder = torch.empty(
+        query.shape[0],
+        query.shape[1],
+        key.shape[1],
+        dtype=query.dtype,
+        device=query.device,
+    )
+    scores = torch.baddbmm(
+        placeholder,
+        query,
+        key.transpose(-1, -2),
+        beta=0,
+        alpha=attn.scale,
+    )
+    if attention_mask is not None:
+        scores = scores + attention_mask
+    return scores.float() if attn.upcast_softmax else scores
+class CrossAttnProcessor(nn.Module):
+    """Attention processor that can add region-dependent weights to cross-attention scores."""
+    def __call__(
+        self,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+    ):
+        """
+        Run attention for `attn`. When `encoder_hidden_states` is given it is a dict read via
+        the keys "img_state", "states", "weight_func" and "sigma"; otherwise self-attention
+        over `hidden_states` is computed.
+        """
+        _, sequence_length, _ = hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length)
+        # Self-attention unless a conditioning dict is supplied.
+        context = hidden_states
+        img_state = None
+        if encoder_hidden_states is not None:
+            img_state = encoder_hidden_states["img_state"]
+            context = encoder_hidden_states["states"]
+            weight_func = encoder_hidden_states["weight_func"]
+            sigma = encoder_hidden_states["sigma"]
+        query = attn.head_to_batch_dim(attn.to_q(hidden_states))
+        key = attn.head_to_batch_dim(attn.to_k(context))
+        value = attn.head_to_batch_dim(attn.to_v(context))
+        if isinstance(img_state, dict):
+            # Weighted path: explicit scores via torch.baddbmm (slow) so they can be biased.
+            attention_scores = get_attention_scores(attn, query, key, attention_mask)
+            region_weight = img_state[sequence_length].to(query.device)
+            bias = weight_func(region_weight, sigma, attention_scores)
+            attention_scores += torch.repeat_interleave(bias, repeats=attn.heads, dim=0)
+            attention_probs = attention_scores.softmax(dim=-1).to(query.dtype)
+            hidden_states = torch.bmm(attention_probs, value)
+        elif xformers_available:
+            hidden_states = xformers.ops.memory_efficient_attention(
+                query.contiguous(),
+                key.contiguous(),
+                value.contiguous(),
+                attn_bias=attention_mask,
+            ).to(query.dtype)
+        else:
+            # Fallback: bucketed flash-attention implementation from this file.
+            q_bucket_size, k_bucket_size = 512, 1024
+            hidden_states = FlashAttentionFunction.apply(
+                query.contiguous(),
+                key.contiguous(),
+                value.contiguous(),
+                attention_mask,
+                False,
+                q_bucket_size,
+                k_bucket_size,
+            ).to(query.dtype)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+        # to_out[0] is the linear projection, to_out[1] the dropout
+        for out_layer in (attn.to_out[0], attn.to_out[1]):
+            hidden_states = out_layer(hidden_states)
+        return hidden_states
+class ModelWrapper:
+    """Expose a model through an `apply_model` method and carry `alphas_cumprod` alongside it."""
+    def __init__(self, model, alphas_cumprod):
+        self.model = model
+        self.alphas_cumprod = alphas_cumprod
+    def apply_model(self, *args, **kwargs):
+        """
+        Call the wrapped model and return the `.sample` attribute of its output.
+        The conditioning may be given as the third positional argument or as `cond=`;
+        a non-None `cond` takes precedence. It is forwarded as `encoder_hidden_states`.
+        Raises:
+            TypeError: if no conditioning was supplied in either form.
+        """
+        encoder_hidden_states = None
+        has_conditioning = False
+        if len(args) == 3:
+            encoder_hidden_states = args[-1]
+            args = args[:2]
+            has_conditioning = True
+        # Always strip `cond` so it is never forwarded to the model as a stray keyword.
+        cond = kwargs.pop("cond", None)
+        if cond is not None:
+            encoder_hidden_states = cond
+            has_conditioning = True
+        if not has_conditioning:
+            raise TypeError(
+                "apply_model() needs the conditioning as a third positional argument or as `cond=`"
+            )
+        return self.model(
+            *args, encoder_hidden_states=encoder_hidden_states, **kwargs
+        ).sample
+class StableDiffusionPipeline(DiffusionPipeline):
+    _optional_components = ["safety_checker", "feature_extractor"]
+    def __init__(
+        self,
+        vae,
+        text_encoder,
+        tokenizer,
+        unet,
+        scheduler,
+    ):
+        """Register the components, then build the k-diffusion wrapper and the prompt parser."""
+        super().__init__()
+        components = dict(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        self.register_modules(**components)
+        # the wrapper built here is what later supplies the sigmas
+        self.setup_unet(self.unet)
+        self.setup_text_encoder()
+    def setup_text_encoder(self, n=1, new_encoder=None):
+        """
+        Build the prompt parser from the tokenizer and text encoder.
+        `new_encoder`, when given, replaces the current text encoder first; `n` is stored on
+        the parser as `CLIP_stop_at_last_layers`.
+        """
+        if new_encoder is not None:
+            self.text_encoder = new_encoder
+        parser = FrozenCLIPEmbedderWithCustomWords(self.tokenizer, self.text_encoder)
+        parser.CLIP_stop_at_last_layers = n
+        self.prompt_parser = parser
+    def setup_unet(self, unet):
+        """Wrap `unet` for k-diffusion, choosing the denoiser class from the scheduler's prediction type."""
+        unet = unet.to(self.device)
+        wrapped = ModelWrapper(unet, self.scheduler.alphas_cumprod)
+        is_v_prediction = self.scheduler.prediction_type == "v_prediction"
+        denoiser_cls = CompVisVDenoiser if is_v_prediction else CompVisDenoiser
+        self.k_diffusion_model = denoiser_cls(wrapped)
+    def get_scheduler(self, scheduler_type: str):
+        """Return the attribute named `scheduler_type` from `k_diffusion.sampling`."""
+        k_diff = importlib.import_module("k_diffusion")
+        return getattr(k_diff.sampling, scheduler_type)
+    def encode_sketchs(self, state, scale_ratio=8, g_strength=1.0, text_ids=None):
+        """
+        Turn painted region maps into per-resolution attention weight tensors.
+        Args:
+            state: mapping from a text fragment to a dict read via the keys "map",
+                "mask_outsides" and "weight"; may be None.
+            scale_ratio: downscale factor used for the first U-Net down block; it is doubled
+                after every block.
+            g_strength: multiplier applied to every region weight.
+            text_ids: indexable whose items 0 and 1 are the unconditional and conditional
+                token ids. It is indexed before `state` is checked, so it must not be None.
+        Returns:
+            An empty FloatTensor when `state` is None or no entry has a map; otherwise a dict
+            keyed by `w_r * h_r` whose values concatenate the unconditional and conditional
+            weight tensors along dim 0.
+        """
+        uncond, cond = text_ids[0], text_ids[1]
+        img_state = []
+        if state is None:
+            return torch.FloatTensor(0)
+        for k, v in state.items():
+            if v["map"] is None:
+                continue
+            # tokenize the fragment without special tokens so it can be matched inside the prompt ids
+            v_input = self.tokenizer(
+                k,
+                max_length=self.tokenizer.model_max_length,
+                truncation=True,
+                add_special_tokens=False,
+            ).input_ids
+            # entries below 255 count as painted (presumably a uint8 numpy image - TODO confirm)
+            dotmap = v["map"] < 255
+            out = dotmap.astype(float)
+            if v["mask_outsides"]:
+                # unpainted positions get a negative weight instead of zero
+                out[out==0] = -1
+            arr = torch.from_numpy(
+                out * float(v["weight"]) * g_strength
+            )
+            img_state.append((v_input, arr))
+        if len(img_state) == 0:
+            return torch.FloatTensor(0)
+        w_tensors = dict()
+        cond = cond.tolist()
+        uncond = uncond.tolist()
+        # one weight tensor per down block, each at a coarser resolution than the previous one
+        for layer in self.unet.down_blocks:
+            c = int(len(cond))
+            w, h = img_state[0][1].shape
+            w_r, h_r = w // scale_ratio, h // scale_ratio
+            ret_cond_tensor = torch.zeros((1, int(w_r * h_r), c), dtype=torch.float32)
+            ret_uncond_tensor = torch.zeros((1, int(w_r * h_r), c), dtype=torch.float32)
+            for v_as_tokens, img_where_color in img_state:
+                is_in = 0
+                # downscale the map, flatten it and repeat it once per token of the fragment
+                ret = (
+                    F.interpolate(
+                        img_where_color.unsqueeze(0).unsqueeze(1),
+                        scale_factor=1 / scale_ratio,
+                        mode="bilinear",
+                        align_corners=True,
+                    )
+                    .squeeze()
+                    .reshape(-1, 1)
+                    .repeat(1, len(v_as_tokens))
+                )
+                # add the map at every position where the fragment's tokens occur in the ids
+                for idx, tok in enumerate(cond):
+                    if cond[idx : idx + len(v_as_tokens)] == v_as_tokens:
+                        is_in = 1
+                        ret_cond_tensor[0, :, idx : idx + len(v_as_tokens)] += ret
+                for idx, tok in enumerate(uncond):
+                    if uncond[idx : idx + len(v_as_tokens)] == v_as_tokens:
+                        is_in = 1
+                        ret_uncond_tensor[0, :, idx : idx + len(v_as_tokens)] += ret
+                if not is_in == 1:
+                    print(f"tokens {v_as_tokens} not found in text")
+            w_tensors[w_r * h_r] = torch.cat([ret_uncond_tensor, ret_cond_tensor])
+            scale_ratio *= 2
+        return w_tensors
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Turn on sliced attention, trading a little speed for lower memory use.
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                `"auto"` selects half of the U-Net's configured `attention_head_dim`. Any other
+                value is handed to the U-Net's `set_attention_slice` unchanged.
+        """
+        # half the attention head size is usually a good speed/memory trade-off
+        chosen = (
+            self.unet.config.attention_head_dim // 2
+            if slice_size == "auto"
+            else slice_size
+        )
+        self.unet.set_attention_slice(chosen)
+    def disable_attention_slicing(self):
+        r"""
+        Turn sliced attention off again by requesting a slice size of `None`.
+        """
+        self.enable_attention_slicing(slice_size=None)
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offload the U-Net, text encoder, VAE and (if present) safety checker with accelerate's
+        `cpu_offload`, targeting the device `cuda:{gpu_id}`.
+        Raises:
+            ImportError: if accelerate is not available.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+        device = torch.device(f"cuda:{gpu_id}")
+        # The safety checker is an optional component that this pipeline's __init__ never
+        # registers, so read it defensively instead of assuming the attribute exists.
+        offload_targets = [
+            self.unet,
+            self.text_encoder,
+            self.vae,
+            getattr(self, "safety_checker", None),
+        ]
+        for cpu_offloaded_model in offload_targets:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+    @property
+    def _execution_device(self):
+        r"""
+        Device on which the pipeline's models run. If the pipeline device is `meta` and the
+        U-Net carries an `_hf_hook`, the device is taken from the first U-Net submodule whose
+        hook reports an execution device; otherwise the pipeline device is returned.
+        """
+        hooked = self.device == torch.device("meta") and hasattr(self.unet, "_hf_hook")
+        if not hooked:
+            return self.device
+        for module in self.unet.modules():
+            hook = getattr(module, "_hf_hook", None)
+            execution_device = getattr(hook, "execution_device", None)
+            if execution_device is not None:
+                return torch.device(execution_device)
+        return self.device
+    def decode_latents(self, latents):
+        """Decode latents with the VAE and return the images as a float32 numpy array in [0, 1], channels last."""
+        scaled = 1 / 0.18215 * latents.to(self.device, dtype=self.vae.dtype)
+        decoded = self.vae.decode(scaled).sample
+        pixels = (decoded / 2 + 0.5).clamp(0, 1)
+        # always cast to float32: the overhead is negligible and it is compatible with bfloat16
+        return pixels.cpu().permute(0, 2, 3, 1).float().numpy()
+    def check_inputs(self, prompt, height, width, callback_steps):
+        """
+        Validate the generation arguments.
+        Raises:
+            ValueError: if `prompt` is neither `str` nor `list`, if `height` or `width` is not
+                divisible by 8, or if `callback_steps` is not a positive integer.
+        """
+        if not isinstance(prompt, (str, list)):
+            raise ValueError(
+                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
+            )
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
+            )
+        steps_ok = (
+            callback_steps is not None
+            and isinstance(callback_steps, int)
+            and callback_steps > 0
+        )
+        if not steps_ok:
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+    def prepare_latents(
+        self,
+        batch_size,
+        num_channels_latents,
+        height,
+        width,
+        dtype,
+        device,
+        generator,
+        latents=None,
+    ):
+        """
+        Return the starting latents: `latents` moved to `device` when provided, otherwise
+        Gaussian noise of shape (batch_size, num_channels_latents, height // 8, width // 8).
+        No scaling is applied here.
+        """
+        shape = (batch_size, num_channels_latents, height // 8, width // 8)
+        if latents is not None:
+            # the shape of caller-provided latents is deliberately not validated
+            return latents.to(device)
+        if device.type == "mps":
+            # randn does not work reproducibly on mps, so sample on the CPU and move over
+            cpu_noise = torch.randn(shape, generator=generator, device="cpu", dtype=dtype)
+            return cpu_noise.to(device)
+        return torch.randn(shape, generator=generator, device=device, dtype=dtype)
+    def preprocess(self, image):
+        """
+        Convert the input image(s) to a tensor batch.
+        A tensor is returned unchanged and a list of tensors is concatenated along dim 0.
+        PIL images (one or a list) are resized to the first image's size rounded down to a
+        multiple of 8, scaled to [-1, 1] and returned channels-first.
+        """
+        if isinstance(image, torch.Tensor):
+            return image
+        if isinstance(image, PIL.Image.Image):
+            image = [image]
+        first = image[0]
+        if isinstance(first, PIL.Image.Image):
+            width, height = first.size
+            # shrink to an integer multiple of 8
+            width, height = width - width % 8, height - height % 8
+            frames = [
+                np.array(
+                    picture.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])
+                )[None, :]
+                for picture in image
+            ]
+            batch = np.concatenate(frames, axis=0)
+            batch = np.array(batch).astype(np.float32) / 255.0
+            batch = batch.transpose(0, 3, 1, 2)
+            image = torch.from_numpy(2.0 * batch - 1.0)
+        elif isinstance(first, torch.Tensor):
+            image = torch.cat(image, dim=0)
+        return image
+    @torch.no_grad()
+    def img2img(
+        self,
+        prompt: Union[str, List[str]],
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        generator: Optional[torch.Generator] = None,
+        image: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        latents=None,
+        strength=1.0,
+        pww_state=None,
+        pww_attn_weight=1.0,
+        sampler_name="",
+        sampler_opt={},
+        start_time=-1,
+        timeout=180,
+        scale_ratio=8.0,
+    ):
+        """
+        Denoise an existing image or existing latents with a k-diffusion sampler.
+        Args:
+            prompt, negative_prompt: texts passed together to the prompt parser.
+            num_inference_steps: number of trailing steps of the sigma schedule that are run.
+            guidance_scale: classifier-free guidance weight; must be greater than 1.0.
+            generator: random generator used for VAE sampling and for the added noise.
+            image: optional image(s); when given they are VAE-encoded and replace `latents`.
+            output_type: "pil" converts the decoded images to PIL, anything else keeps numpy.
+            latents: starting latents, used when `image` is None.
+            strength: stretches the schedule to `int(num_inference_steps / min(strength, 0.999))`
+                entries, of which only the last `num_inference_steps` are used.
+            pww_state, pww_attn_weight: region maps and their global weight for `encode_sketchs`.
+            sampler_name: name of the sampler function looked up by `get_scheduler`.
+            sampler_opt: options read by `get_sigmas`.
+            start_time, timeout: when both are positive, every model call asserts that fewer
+                than `timeout` seconds have passed since `start_time`.
+            scale_ratio: not used in this method's body.
+        Returns:
+            A 1-tuple holding the decoded images.
+        Raises:
+            ValueError: if `guidance_scale` <= 1.0.
+        """
+        sampler = self.get_scheduler(sampler_name)
+        if image is not None:
+            image = self.preprocess(image)
+            image = image.to(self.vae.device, dtype=self.vae.dtype)
+            init_latents = self.vae.encode(image).latent_dist.sample(generator)
+            # same constant that decode_latents divides by
+            latents = 0.18215 * init_latents
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        latents = latents.to(device, dtype=self.unet.dtype)
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = True
+        if guidance_scale <= 1.0:
+            raise ValueError("has to use guidance_scale")
+        # 3. Encode input prompt
+        text_ids, text_embeddings = self.prompt_parser([negative_prompt, prompt])
+        text_embeddings = text_embeddings.to(self.unet.dtype)
+        # Build a longer schedule and keep only its tail, so a lower strength starts from less noise.
+        init_timestep = (
+            int(num_inference_steps / min(strength, 0.999)) if strength > 0 else 0
+        )
+        sigmas = self.get_sigmas(init_timestep, sampler_opt).to(
+            text_embeddings.device, dtype=text_embeddings.dtype
+        )
+        t_start = max(init_timestep - num_inference_steps, 0)
+        sigma_sched = sigmas[t_start:]
+        noise = randn_tensor(
+            latents.shape,
+            generator=generator,
+            device=device,
+            dtype=text_embeddings.dtype,
+        )
+        latents = latents.to(device)
+        # noise the latents up to the first sigma of the truncated schedule
+        latents = latents + noise * sigma_sched[0]
+        # 5. Prepare latent variables
+        self.k_diffusion_model.sigmas = self.k_diffusion_model.sigmas.to(latents.device)
+        self.k_diffusion_model.log_sigmas = self.k_diffusion_model.log_sigmas.to(
+            latents.device
+        )
+        img_state = self.encode_sketchs(
+            pww_state,
+            g_strength=pww_attn_weight,
+            text_ids=text_ids,
+        )
+        def model_fn(x, sigma):
+            # Denoiser handed to the sampler: one batched uncond/cond pass plus guidance.
+            if start_time > 0 and timeout > 0:
+                assert (time.time() - start_time) < timeout, "inference process timed out"
+            latent_model_input = torch.cat([x] * 2)
+            weight_func = lambda w, sigma, qk: w * math.log(1 + sigma) * qk.max()
+            # the attention processor reads these keys from the conditioning dict
+            encoder_state = {
+                "img_state": img_state,
+                "states": text_embeddings,
+                "sigma": sigma[0],
+                "weight_func": weight_func,
+            }
+            noise_pred = self.k_diffusion_model(
+                latent_model_input, sigma, cond=encoder_state
+            )
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (
+                noise_pred_text - noise_pred_uncond
+            )
+            return noise_pred
+        sampler_args = self.get_sampler_extra_args_i2i(sigma_sched, sampler)
+        latents = sampler(model_fn, latents, **sampler_args)
+        # 8. Post-processing
+        image = self.decode_latents(latents)
+        # 10. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+        return (image,)
+    def get_sigmas(self, steps, params):
+        """
+        Build the sigma schedule for `steps` sampling steps.
+        `params["scheduler"] == "karras"` selects the Karras schedule spanning the wrapped
+        model's first and last sigma; otherwise the wrapped model's own `get_sigmas` is used.
+        With `params["discard_next_to_last_sigma"]` one extra step is requested and the
+        second-to-last sigma is then dropped.
+        """
+        drop_penultimate = params.get("discard_next_to_last_sigma", False)
+        if drop_penultimate:
+            steps += 1
+        denoiser = self.k_diffusion_model
+        if params.get("scheduler", None) == "karras":
+            sigmas = k_diffusion.sampling.get_sigmas_karras(
+                n=steps,
+                sigma_min=denoiser.sigmas[0].item(),
+                sigma_max=denoiser.sigmas[-1].item(),
+                device=self.device,
+            )
+        else:
+            sigmas = denoiser.get_sigmas(steps)
+        if drop_penultimate:
+            sigmas = torch.cat([sigmas[:-2], sigmas[-1:]])
+        return sigmas
+    # https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/48a15821de768fea76e66f26df83df3fddf18f4b/modules/sd_samplers.py#L454
+    def get_sampler_extra_args_t2i(self, sigmas, eta, steps, func):
+        """
+        Collect the keyword arguments that the sampler `func` accepts for text-to-image.
+        Args:
+            sigmas: the sampling schedule (passed only to samplers taking `sigmas`).
+            eta: passed to samplers that take `eta`.
+            steps: passed as `n` to samplers that take `n`.
+            func: the sampler function whose signature is inspected.
+        Returns:
+            dict of keyword arguments for `func`.
+        """
+        accepted = inspect.signature(func).parameters
+        extra_params_kwargs = {}
+        if "eta" in accepted:
+            extra_params_kwargs["eta"] = eta
+        if "sigma_min" in accepted:
+            # Use the wrapped model's sigma table (first entry = smallest, last = largest, the
+            # same convention get_sigmas relies on). The schedule itself is descending and ends
+            # in 0, so reading the bounds from it would swap them and yield sigma_max == 0.
+            model_sigmas = self.k_diffusion_model.sigmas
+            extra_params_kwargs["sigma_min"] = model_sigmas[0].item()
+            extra_params_kwargs["sigma_max"] = model_sigmas[-1].item()
+        if "n" in accepted:
+            extra_params_kwargs["n"] = steps
+        elif "sigmas" in accepted:
+            # samplers that take neither `n` nor `sigmas` must not receive a schedule
+            extra_params_kwargs["sigmas"] = sigmas
+        return extra_params_kwargs
+    # https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/48a15821de768fea76e66f26df83df3fddf18f4b/modules/sd_samplers.py#L454
+    def get_sampler_extra_args_i2i(self, sigmas, func):
+        """
+        Collect the keyword arguments that the sampler `func` accepts for image-to-image,
+        all derived from the (truncated) schedule `sigmas`.
+        """
+        accepted = inspect.signature(func).parameters
+        kwargs_out = {}
+        if "sigma_min" in accepted:
+            # the last sigma is zero, which DPM Fast & Adaptive do not allow, so take the one before it
+            kwargs_out["sigma_min"] = sigmas[-2]
+        if "sigma_max" in accepted:
+            kwargs_out["sigma_max"] = sigmas[0]
+        if "n" in accepted:
+            kwargs_out["n"] = len(sigmas) - 1
+        # both spellings of the schedule argument are served with the same value
+        for schedule_name in ("sigma_sched", "sigmas"):
+            if schedule_name in accepted:
+                kwargs_out[schedule_name] = sigmas
+        return kwargs_out
+    @torch.no_grad()
+    def txt2img(
+        self,
+        prompt: Union[str, List[str]],
+        height: int = 512,
+        width: int = 512,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        callback_steps: Optional[int] = 1,
+        upscale=False,
+        upscale_x: float = 2.0,
+        upscale_method: str = "bicubic",
+        upscale_antialias: bool = False,
+        upscale_denoising_strength: float = 0.7,
+        pww_state=None,
+        pww_attn_weight=1.0,
+        sampler_name="",
+        sampler_opt={},
+        start_time=-1,
+        timeout=180,
+    ):
+        """
+        Generate images from text with a k-diffusion sampler, optionally followed by a
+        latent upscale and an `img2img` refinement pass.
+        Args:
+            prompt, negative_prompt: texts passed together to the prompt parser.
+            height, width: output size in pixels; both must be divisible by 8.
+            num_inference_steps: number of sampling steps.
+            guidance_scale: classifier-free guidance weight; must be greater than 1.0.
+            eta: passed to samplers that accept `eta`.
+            generator: random generator for the initial noise.
+            latents: optional starting noise; it is multiplied by the first sigma.
+            output_type: "pil" converts the decoded images to PIL, anything else keeps numpy.
+            callback_steps: only validated by `check_inputs`.
+            upscale: when true, the latents are resized by `upscale_x` using `upscale_method`
+                (and `upscale_antialias`) and refined by `img2img` with strength
+                `upscale_denoising_strength`.
+            pww_state, pww_attn_weight: region maps and their global weight for `encode_sketchs`.
+            sampler_name: name of the sampler function looked up by `get_scheduler`.
+            sampler_opt: options read by `get_sigmas`.
+            start_time, timeout: when both are positive, every model call asserts that fewer
+                than `timeout` seconds have passed since `start_time`.
+        Returns:
+            A 1-tuple holding the decoded images.
+        Raises:
+            ValueError: on invalid inputs or if `guidance_scale` <= 1.0.
+        """
+        sampler = self.get_scheduler(sampler_name)
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+        # 2. Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance, which this method does not support.
+        if guidance_scale <= 1.0:
+            raise ValueError("has to use guidance_scale")
+        # 3. Encode input prompt
+        text_ids, text_embeddings = self.prompt_parser([negative_prompt, prompt])
+        text_embeddings = text_embeddings.to(self.unet.dtype)
+        # 4. Prepare timesteps
+        sigmas = self.get_sigmas(num_inference_steps, sampler_opt).to(
+            text_embeddings.device, dtype=text_embeddings.dtype
+        )
+        # 5. Prepare latent variables
+        num_channels_latents = self.unet.in_channels
+        latents = self.prepare_latents(
+            batch_size,
+            num_channels_latents,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+        # scale the unit noise to the first (largest) sigma of the schedule
+        latents = latents * sigmas[0]
+        self.k_diffusion_model.sigmas = self.k_diffusion_model.sigmas.to(latents.device)
+        self.k_diffusion_model.log_sigmas = self.k_diffusion_model.log_sigmas.to(
+            latents.device
+        )
+        img_state = self.encode_sketchs(
+            pww_state,
+            g_strength=pww_attn_weight,
+            text_ids=text_ids,
+        )
+        def model_fn(x, sigma):
+            # Denoiser handed to the sampler: one batched uncond/cond pass plus guidance.
+            if start_time > 0 and timeout > 0:
+                assert (time.time() - start_time) < timeout, "inference process timed out"
+            latent_model_input = torch.cat([x] * 2)
+            weight_func = lambda w, sigma, qk: w * math.log(1 + sigma) * qk.max()
+            encoder_state = {
+                "img_state": img_state,
+                "states": text_embeddings,
+                "sigma": sigma[0],
+                "weight_func": weight_func,
+            }
+            noise_pred = self.k_diffusion_model(
+                latent_model_input, sigma, cond=encoder_state
+            )
+            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+            noise_pred = noise_pred_uncond + guidance_scale * (
+                noise_pred_text - noise_pred_uncond
+            )
+            return noise_pred
+        extra_args = self.get_sampler_extra_args_t2i(
+            sigmas, eta, num_inference_steps, sampler
+        )
+        latents = sampler(model_fn, latents, **extra_args)
+        if upscale:
+            target_height = height * upscale_x
+            target_width = width * upscale_x
+            vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+            latents = torch.nn.functional.interpolate(
+                latents,
+                size=(
+                    int(target_height // vae_scale_factor),
+                    int(target_width // vae_scale_factor),
+                ),
+                mode=upscale_method,
+                antialias=upscale_antialias,
+            )
+            # The refinement pass runs without region maps. The caller's output type and time
+            # budget are forwarded so this path honours them like the plain path does.
+            return self.img2img(
+                prompt=prompt,
+                num_inference_steps=num_inference_steps,
+                guidance_scale=guidance_scale,
+                negative_prompt=negative_prompt,
+                generator=generator,
+                output_type=output_type,
+                latents=latents,
+                strength=upscale_denoising_strength,
+                sampler_name=sampler_name,
+                sampler_opt=sampler_opt,
+                pww_state=None,
+                pww_attn_weight=pww_attn_weight / 2,
+                start_time=start_time,
+                timeout=timeout,
+            )
+        # 8. Post-processing
+        image = self.decode_latents(latents)
+        # 10. Convert to PIL
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+        return (image,)
+class FlashAttentionFunction(Function):
+    """Attention computed tile by tile as a custom autograd function.
+    Queries are processed in buckets of `q_bucket_size` rows and keys/values in buckets of
+    `k_bucket_size` rows, keeping a running row-wise softmax so only one
+    (query bucket x key bucket) block of attention weights exists at a time.
+    NOTE(review): `Function`, `exists`, `rearrange`, `einsum` and `EPSILON` are defined or
+    imported elsewhere in this file; "the paper" in the method docstrings presumably
+    refers to the FlashAttention paper - confirm.
+    """
+    @staticmethod
+    @torch.no_grad()
+    def forward(ctx, q, k, v, mask, causal, q_bucket_size, k_bucket_size):
+        """Algorithm 2 in the paper: tiled forward pass.
+        Args:
+            ctx: autograd context; receives the non-tensor arguments and saved tensors.
+            q, k, v: query, key and value tensors; the second-to-last dimension is the
+                sequence axis and the last one the feature axis (both are indexed that
+                way below).
+            mask: optional boolean tensor, rearranged here from "b n" to "b 1 1 n".
+            causal: when truthy, a triangular mask is applied per block.
+            q_bucket_size: number of query rows per tile.
+            k_bucket_size: number of key/value rows per tile.
+        Returns:
+            The attention output, same shape and dtype as `q`.
+        """
+        device = q.device
+        # most negative finite value of q's dtype, used as "minus infinity" for masking
+        max_neg_value = -torch.finfo(q.dtype).max
+        # number of extra key positions compared to queries; shifts the causal diagonal
+        qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)
+        o = torch.zeros_like(q)
+        # running softmax statistics, one scalar per query row; no dtype is passed, so
+        # they are created in torch's default dtype rather than q.dtype
+        all_row_sums = torch.zeros((*q.shape[:-1], 1), device=device)
+        all_row_maxes = torch.full((*q.shape[:-1], 1), max_neg_value, device=device)
+        scale = q.shape[-1] ** -0.5
+        if not exists(mask):
+            # one placeholder per query bucket so the zip below still lines up
+            mask = (None,) * math.ceil(q.shape[-2] / q_bucket_size)
+        else:
+            mask = rearrange(mask, "b n -> b 1 1 n")
+            # NOTE(review): the mask is split by q_bucket_size and paired with query
+            # buckets, yet applied along the key axis of attn_weights below - presumably
+            # this assumes matching query/key lengths and bucket sizes; confirm.
+            mask = mask.split(q_bucket_size, dim=-1)
+        # Tensor.split returns views, so the in-place updates on oc / row_sums /
+        # row_maxes below write straight into o / all_row_sums / all_row_maxes.
+        row_splits = zip(
+            q.split(q_bucket_size, dim=-2),
+            o.split(q_bucket_size, dim=-2),
+            mask,
+            all_row_sums.split(q_bucket_size, dim=-2),
+            all_row_maxes.split(q_bucket_size, dim=-2),
+        )
+        for ind, (qc, oc, row_mask, row_sums, row_maxes) in enumerate(row_splits):
+            # position of this bucket's first query row relative to the key sequence
+            q_start_index = ind * q_bucket_size - qk_len_diff
+            col_splits = zip(
+                k.split(k_bucket_size, dim=-2),
+                v.split(k_bucket_size, dim=-2),
+            )
+            for k_ind, (kc, vc) in enumerate(col_splits):
+                k_start_index = k_ind * k_bucket_size
+                # scaled dot products for this (query bucket, key bucket) tile
+                attn_weights = einsum("... i d, ... j d -> ... i j", qc, kc) * scale
+                if exists(row_mask):
+                    attn_weights.masked_fill_(~row_mask, max_neg_value)
+                # only build a causal mask for tiles that reach past the diagonal
+                if causal and q_start_index < (k_start_index + k_bucket_size - 1):
+                    causal_mask = torch.ones(
+                        (qc.shape[-2], kc.shape[-2]), dtype=torch.bool, device=device
+                    ).triu(q_start_index - k_start_index + 1)
+                    attn_weights.masked_fill_(causal_mask, max_neg_value)
+                # subtract the per-row maximum of this tile before exponentiating
+                block_row_maxes = attn_weights.amax(dim=-1, keepdims=True)
+                attn_weights -= block_row_maxes
+                exp_weights = torch.exp(attn_weights)
+                if exists(row_mask):
+                    exp_weights.masked_fill_(~row_mask, 0.0)
+                # the tile's row sums are clamped from below by EPSILON
+                block_row_sums = exp_weights.sum(dim=-1, keepdims=True).clamp(
+                    min=EPSILON
+                )
+                new_row_maxes = torch.maximum(block_row_maxes, row_maxes)
+                exp_values = einsum("... i j, ... j d -> ... i d", exp_weights, vc)
+                # factors that re-express the old and the new statistics relative to
+                # the updated running maximum
+                exp_row_max_diff = torch.exp(row_maxes - new_row_maxes)
+                exp_block_row_max_diff = torch.exp(block_row_maxes - new_row_maxes)
+                new_row_sums = (
+                    exp_row_max_diff * row_sums
+                    + exp_block_row_max_diff * block_row_sums
+                )
+                # rescale the output accumulated so far and add this tile's contribution
+                oc.mul_((row_sums / new_row_sums) * exp_row_max_diff).add_(
+                    (exp_block_row_max_diff / new_row_sums) * exp_values
+                )
+                row_maxes.copy_(new_row_maxes)
+                row_sums.copy_(new_row_sums)
+        # per-row log-sum-exp of the scaled scores, reused by backward
+        lse = all_row_sums.log() + all_row_maxes
+        # note: `mask` stored here is the tuple of per-bucket masks built above
+        ctx.args = (causal, scale, mask, q_bucket_size, k_bucket_size)
+        ctx.save_for_backward(q, k, v, o, lse)
+        return o
+    @staticmethod
+    @torch.no_grad()
+    def backward(ctx, do):
+        """Algorithm 4 in the paper: tiled backward pass.
+        Recomputes the attention probabilities tile by tile from the saved q, k and
+        log-sum-exp values instead of storing them.
+        Args:
+            ctx: autograd context filled in by `forward`.
+            do: gradient with respect to the forward output.
+        Returns:
+            A 7-tuple matching the arguments of `forward`: gradients for q, k and v,
+            followed by None for mask, causal, q_bucket_size and k_bucket_size.
+        """
+        causal, scale, mask, q_bucket_size, k_bucket_size = ctx.args
+        q, k, v, o, lse = ctx.saved_tensors
+        device = q.device
+        max_neg_value = -torch.finfo(q.dtype).max
+        qk_len_diff = max(k.shape[-2] - q.shape[-2], 0)
+        dq = torch.zeros_like(q)
+        dk = torch.zeros_like(k)
+        dv = torch.zeros_like(v)
+        # dq is split into views so dqc.add_ below accumulates directly into dq
+        row_splits = zip(
+            q.split(q_bucket_size, dim=-2),
+            o.split(q_bucket_size, dim=-2),
+            do.split(q_bucket_size, dim=-2),
+            mask,
+            lse.split(q_bucket_size, dim=-2),
+            dq.split(q_bucket_size, dim=-2),
+        )
+        for ind, (qc, oc, doc, row_mask, lsec, dqc) in enumerate(row_splits):
+            q_start_index = ind * q_bucket_size - qk_len_diff
+            # dk / dv are likewise split into views that are accumulated in place
+            col_splits = zip(
+                k.split(k_bucket_size, dim=-2),
+                v.split(k_bucket_size, dim=-2),
+                dk.split(k_bucket_size, dim=-2),
+                dv.split(k_bucket_size, dim=-2),
+            )
+            for k_ind, (kc, vc, dkc, dvc) in enumerate(col_splits):
+                k_start_index = k_ind * k_bucket_size
+                attn_weights = einsum("... i d, ... j d -> ... i j", qc, kc) * scale
+                if causal and q_start_index < (k_start_index + k_bucket_size - 1):
+                    causal_mask = torch.ones(
+                        (qc.shape[-2], kc.shape[-2]), dtype=torch.bool, device=device
+                    ).triu(q_start_index - k_start_index + 1)
+                    attn_weights.masked_fill_(causal_mask, max_neg_value)
+                # attention probabilities of this tile, normalised via the saved lse
+                p = torch.exp(attn_weights - lsec)
+                if exists(row_mask):
+                    p.masked_fill_(~row_mask, 0.0)
+                dv_chunk = einsum("... i j, ... i d -> ... j d", p, doc)
+                dp = einsum("... i d, ... j d -> ... i j", doc, vc)
+                # row-wise sum of (grad output * output)
+                D = (doc * oc).sum(dim=-1, keepdims=True)
+                # gradient with respect to the unscaled q.k products
+                ds = p * scale * (dp - D)
+                dq_chunk = einsum("... i j, ... j d -> ... i d", ds, kc)
+                dk_chunk = einsum("... i j, ... i d -> ... j d", ds, qc)
+                dqc.add_(dq_chunk)
+                dkc.add_(dk_chunk)
+                dvc.add_(dv_chunk)
+        return dq, dk, dv, None, None, None, None

modules/prompt_parser.py ADDED Viewed

	@@ -0,0 +1,391 @@

+import re
+import math
+import numpy as np
+import torch
+# Code from https://github.com/AUTOMATIC1111/stable-diffusion-webui/commit/8e2aeee4a127b295bfc880800e4a312e0f049b85, modified.
+class PromptChunk:
+    """
+    Container for one chunk of a tokenized prompt: token ids, per-token weights
+    (multipliers such as the 1.4 in "(text:1.4)") and textual inversion embedding info.
+    If a prompt is short, it is represented by one PromptChunk, otherwise, multiple are necessary.
+    A finished PromptChunk holds exactly 77 tokens: one start token, 75 tokens taken from
+    the prompt (padded with end tokens if needed) and one end token.
+    """
+    def __init__(self):
+        # token ids of this chunk
+        self.tokens = []
+        # one weight per entry in `tokens`
+        self.multipliers = []
+        # textual inversion info; never filled in by the code visible in this file
+        self.fixes = []
+class FrozenCLIPEmbedderWithCustomWordsBase(torch.nn.Module):
+    """A pytorch module that wraps a text encoder and makes it possible to have unlimited
+    prompt length and to assign weights to tokens in the prompt.
+    This base class does not define `tokenize`, `encode_with_transformers`, `id_start`,
+    `id_end`, `id_pad` or `comma_token`; they are used through `self` below, so a
+    subclass has to provide them.
+    """
+    def __init__(self, text_encoder, enable_emphasis=True):
+        """Store the configuration.
+        Args:
+            text_encoder: encoder whose `.device` is looked up lazily via `self.device()`.
+            enable_emphasis: when true, prompts are parsed for the (text:weight) / [text]
+                emphasis syntax; otherwise the whole line gets weight 1.0.
+        """
+        super().__init__()
+        # a callable rather than a value, so the encoder's current device is read on each call
+        self.device = lambda: text_encoder.device
+        self.enable_emphasis = enable_emphasis
+        # NOTE(review): the string below is a no-op expression statement, not a docstring;
+        # it appears to be leftover documentation for an attribute that is not assigned here.
+        """Original FrozenCLIPEmbedder module; can also be FrozenOpenCLIPEmbedder or xlmr.BertSeriesModelWithTransformation,
+        depending on model."""
+        # number of prompt tokens per chunk, excluding the start and end tokens
+        self.chunk_length = 75
+    def empty_chunk(self):
+        """creates an empty PromptChunk (start token followed by end tokens, all weights 1.0) and returns it"""
+        chunk = PromptChunk()
+        chunk.tokens = [self.id_start] + [self.id_end] * (self.chunk_length + 1)
+        chunk.multipliers = [1.0] * (self.chunk_length + 2)
+        return chunk
+    def get_target_prompt_token_count(self, token_count):
+        """returns the maximum number of tokens a prompt of a known length can have before it requires one more PromptChunk to be represented"""
+        return math.ceil(max(token_count, 1) / self.chunk_length) * self.chunk_length
+    def tokenize_line(self, line):
+        """
+        this transforms a single prompt into a list of PromptChunk objects - as many as needed to
+        represent the prompt.
+        Returns the list and the total number of tokens in the prompt.
+        """
+        if self.enable_emphasis:
+            parsed = parse_prompt_attention(line)
+        else:
+            parsed = [[line, 1.0]]
+        # one list of token ids per (text, weight) pair, in the same order as `parsed`
+        tokenized = self.tokenize([text for text, _ in parsed])
+        chunks = []
+        chunk = PromptChunk()
+        token_count = 0
+        # index inside the current chunk of the most recent comma token, -1 if none
+        last_comma = -1
+        def next_chunk(is_last=False):
+            """puts current chunk into the list of results and produces the next one - empty;
+            if is_last is true, the <end-of-text> padding tokens at the end are not added to token_count"""
+            nonlocal token_count
+            nonlocal last_comma
+            nonlocal chunk
+            if is_last:
+                token_count += len(chunk.tokens)
+            else:
+                token_count += self.chunk_length
+            # pad the chunk with end tokens up to chunk_length
+            to_add = self.chunk_length - len(chunk.tokens)
+            if to_add > 0:
+                chunk.tokens += [self.id_end] * to_add
+                chunk.multipliers += [1.0] * to_add
+            # wrap with start/end tokens: chunk_length + 2 entries in total
+            chunk.tokens = [self.id_start] + chunk.tokens + [self.id_end]
+            chunk.multipliers = [1.0] + chunk.multipliers + [1.0]
+            last_comma = -1
+            chunks.append(chunk)
+            chunk = PromptChunk()
+        comma_padding_backtrack = 20  # default value in https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/6cff4401824299a983c8e13424018efc347b4a2b/modules/shared.py#L410
+        for tokens, (text, weight) in zip(tokenized, parsed):
+            # parse_prompt_attention marks an explicit chunk break as ["BREAK", -1]
+            if text == "BREAK" and weight == -1:
+                next_chunk()
+                continue
+            position = 0
+            while position < len(tokens):
+                token = tokens[position]
+                if token == self.comma_token:
+                    last_comma = len(chunk.tokens)
+                # this is when we are at the end of allotted 75 tokens for the current chunk, and the current token is not a comma. opts.comma_padding_backtrack
+                # is a setting that specifies that if there is a comma nearby, the text after the comma should be moved out of this chunk and into the next.
+                elif (
+                    comma_padding_backtrack != 0
+                    and len(chunk.tokens) == self.chunk_length
+                    and last_comma != -1
+                    and len(chunk.tokens) - last_comma <= comma_padding_backtrack
+                ):
+                    break_location = last_comma + 1
+                    # tokens after the last comma are carried over into the next chunk
+                    reloc_tokens = chunk.tokens[break_location:]
+                    reloc_mults = chunk.multipliers[break_location:]
+                    chunk.tokens = chunk.tokens[:break_location]
+                    chunk.multipliers = chunk.multipliers[:break_location]
+                    next_chunk()
+                    chunk.tokens = reloc_tokens
+                    chunk.multipliers = reloc_mults
+                # chunk is full: flush it before appending the current token
+                if len(chunk.tokens) == self.chunk_length:
+                    next_chunk()
+                chunk.tokens.append(token)
+                chunk.multipliers.append(weight)
+                position += 1
+        # always emit at least one chunk, even for an empty prompt
+        if len(chunk.tokens) > 0 or len(chunks) == 0:
+            next_chunk(is_last=True)
+        return chunks, token_count
+    def process_texts(self, texts):
+        """
+        Accepts a list of texts and calls tokenize_line() on each, reusing the result for
+        texts that repeat within this call. Returns the list of results and maximum
+        length, in tokens, of all texts.
+        """
+        token_count = 0
+        # maps a text to its chunks; lives only for the duration of this call
+        cache = {}
+        batch_chunks = []
+        for line in texts:
+            if line in cache:
+                chunks = cache[line]
+            else:
+                chunks, current_token_count = self.tokenize_line(line)
+                token_count = max(current_token_count, token_count)
+                cache[line] = chunks
+            batch_chunks.append(chunks)
+        return batch_chunks, token_count
+    def forward(self, texts):
+        """
+        Accepts an array of texts; Passes texts through transformers network to create a tensor with numerical representation of those texts.
+        Returns a tuple (token_ids, embeddings):
+        token_ids is a numpy array built by horizontally stacking the token ids of every chunk;
+        embeddings is a tensor with shape of (B, T, C), where B is length of the array; T is length, in tokens, of texts (including padding) - T will
+        be a multiple of 77; and C is dimensionality of each token - for SD1 it's 768, and for SD2 it's 1024.
+        An example shape of the embeddings returned by this function can be: (2, 77, 768).
+        Webui usually sends just one text at a time through this function - the only time when texts is an array with more than one element
+        is when you do prompt editing: "a picture of a [cat:dog:0.4] eating ice cream"
+        """
+        batch_chunks, token_count = self.process_texts(texts)
+        chunk_count = max([len(x) for x in batch_chunks])
+        zs = []
+        ts = []
+        for i in range(chunk_count):
+            # texts with fewer chunks are padded with empty chunks so the batch stays rectangular
+            batch_chunk = [
+                chunks[i] if i < len(chunks) else self.empty_chunk()
+                for chunks in batch_chunks
+            ]
+            tokens = [x.tokens for x in batch_chunk]
+            multipliers = [x.multipliers for x in batch_chunk]
+            # (disabled) textual inversion handling:
+            # self.embeddings.fixes = [x.fixes for x in batch_chunk]
+            # for fixes in self.embeddings.fixes:
+            #     for position, embedding in fixes:
+            #         used_embeddings[embedding.name] = embedding
+            z = self.process_tokens(tokens, multipliers)
+            zs.append(z)
+            ts.append(tokens)
+        # hstack joins the per-chunk results along the token axis
+        return np.hstack(ts), torch.hstack(zs)
+    def process_tokens(self, remade_batch_tokens, batch_multipliers):
+        """
+        sends one single prompt chunk to be encoded by transformers neural network.
+        remade_batch_tokens is a batch of tokens - a list, where every element is a list of tokens; usually
+        there are exactly 77 tokens in the list. batch_multipliers is the same but for multipliers instead of tokens.
+        Multipliers are used to give more or less weight to the outputs of transformers network. Each multiplier
+        corresponds to one token.
+        Returns the weighted encoder output, rescaled so that its overall mean matches the unweighted output.
+        """
+        tokens = torch.asarray(remade_batch_tokens).to(self.device())
+        # this is for SD2: SD1 uses the same token for padding and end of text, while SD2 uses different ones.
+        if self.id_end != self.id_pad:
+            for batch_pos in range(len(remade_batch_tokens)):
+                # everything after the first end token is replaced by the padding token
+                index = remade_batch_tokens[batch_pos].index(self.id_end)
+                tokens[batch_pos, index + 1 : tokens.shape[1]] = self.id_pad
+        z = self.encode_with_transformers(tokens)
+        # restoring original mean is likely not correct, but it seems to work well to prevent artifacts that happen otherwise
+        batch_multipliers = torch.asarray(batch_multipliers).to(self.device())
+        original_mean = z.mean()
+        # broadcast each token's multiplier across that token's embedding vector
+        z = z * batch_multipliers.reshape(batch_multipliers.shape + (1,)).expand(z.shape)
+        new_mean = z.mean()
+        z = z * (original_mean / new_mean)
+        return z
+class FrozenCLIPEmbedderWithCustomWords(FrozenCLIPEmbedderWithCustomWordsBase):
+    def __init__(self, tokenizer, text_encoder):
+        super().__init__(text_encoder)
+        self.tokenizer = tokenizer
+        self.text_encoder = text_encoder
+        vocab = self.tokenizer.get_vocab()
+        self.comma_token = vocab.get(",</w>", None)
+        self.token_mults = {}
+        tokens_with_parens = [
+            (k, v)
+            for k, v in vocab.items()
+            if "(" in k or ")" in k or "[" in k or "]" in k
+        ]
+        for text, ident in tokens_with_parens:
+            mult = 1.0
+            for c in text:
+                if c == "[":
+                    mult /= 1.1
+                if c == "]":
+                    mult *= 1.1
+                if c == "(":
+                    mult *= 1.1
+                if c == ")":
+                    mult /= 1.1
+            if mult != 1.0:
+                self.token_mults[ident] = mult
+        self.id_start = self.tokenizer.bos_token_id
+        self.id_end = self.tokenizer.eos_token_id
+        self.id_pad = self.id_end
+    def tokenize(self, texts):
+        tokenized = self.tokenizer(
+            texts, truncation=False, add_special_tokens=False
+        )["input_ids"]
+        return tokenized
+    def encode_with_transformers(self, tokens):
+        CLIP_stop_at_last_layers = 1
+        tokens = tokens.to(self.text_encoder.device)
+        outputs = self.text_encoder(tokens, output_hidden_states=True)
+        if CLIP_stop_at_last_layers > 1:
+            z = outputs.hidden_states[-CLIP_stop_at_last_layers]
+            z = self.text_encoder.text_model.final_layer_norm(z)
+        else:
+            z = outputs.last_hidden_state
+        return z
+# Tokenizer for the prompt emphasis syntax. Compiled in verbose mode (re.X), so every
+# alternative sits on its own line; alternatives are tried top to bottom:
+#   backslash-escaped ( ) [ ] and \\, then a lone backslash,
+#   an opening "(" or "[",
+#   ":<number>)" - the only alternative with a capture group (the explicit weight),
+#   a closing ")" or "]",
+#   a run of characters that are none of \ ( ) [ ] :,
+#   a lone ":".
+re_attention = re.compile(
+    r"""
+\\\(|
+\\\)|
+\\\[|
+\\]|
+\\\\|
+\\|
+\(|
+\[|
+:([+-]?[.\d]+)\)|
+\)|
+]|
+[^\\()\[\]:]+|
+:
+""",
+    re.X,
+)
+# The keyword BREAK as a whole word, together with any whitespace around it.
+re_break = re.compile(r"\s*\bBREAK\b\s*", re.S)
+def parse_prompt_attention(text):
+    """
+    Parses a string with attention tokens and returns a list of pairs: text and its associated weight.
+    Accepted tokens are:
+      (abc) - increases attention to abc by a multiplier of 1.1
+      (abc:3.12) - increases attention to abc by a multiplier of 3.12
+      [abc] - decreases attention to abc by a multiplier of 1.1
+      \( - literal character '('
+      \[ - literal character '['
+      \) - literal character ')'
+      \] - literal character ']'
+      \\ - literal character '\'
+      anything else - just text
+    >>> parse_prompt_attention('normal text')
+    [['normal text', 1.0]]
+    >>> parse_prompt_attention('an (important) word')
+    [['an ', 1.0], ['important', 1.1], [' word', 1.0]]
+    >>> parse_prompt_attention('(unbalanced')
+    [['unbalanced', 1.1]]
+    >>> parse_prompt_attention('\(literal\]')
+    [['(literal]', 1.0]]
+    >>> parse_prompt_attention('(unnecessary)(parens)')
+    [['unnecessaryparens', 1.1]]
+    >>> parse_prompt_attention('a (((house:1.3)) [on] a (hill:0.5), sun, (((sky))).')
+    [['a ', 1.0],
+     ['house', 1.5730000000000004],
+     [' ', 1.1],
+     ['on', 1.0],
+     [' a ', 1.1],
+     ['hill', 0.55],
+     [', sun, ', 1.1],
+     ['sky', 1.4641000000000006],
+     ['.', 1.1]]
+    """
+    res = []
+    round_brackets = []
+    square_brackets = []
+    round_bracket_multiplier = 1.1
+    square_bracket_multiplier = 1 / 1.1
+    def multiply_range(start_position, multiplier):
+        for p in range(start_position, len(res)):
+            res[p][1] *= multiplier
+    for m in re_attention.finditer(text):
+        text = m.group(0)
+        weight = m.group(1)
+        if text.startswith("\\"):
+            res.append([text[1:], 1.0])
+        elif text == "(":
+            round_brackets.append(len(res))
+        elif text == "[":
+            square_brackets.append(len(res))
+        elif weight is not None and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), float(weight))
+        elif text == ")" and len(round_brackets) > 0:
+            multiply_range(round_brackets.pop(), round_bracket_multiplier)
+        elif text == "]" and len(square_brackets) > 0:
+            multiply_range(square_brackets.pop(), square_bracket_multiplier)
+        else:
+            parts = re.split(re_break, text)
+            for i, part in enumerate(parts):
+                if i > 0:
+                    res.append(["BREAK", -1])
+                res.append([part, 1.0])
+    for pos in round_brackets:
+        multiply_range(pos, round_bracket_multiplier)
+    for pos in square_brackets:
+        multiply_range(pos, square_bracket_multiplier)
+    if len(res) == 0:
+        res = [["", 1.0]]
+    # merge runs of identical weights
+    i = 0
+    while i + 1 < len(res):
+        if res[i][1] == res[i + 1][1]:
+            res[i][0] += res[i + 1][0]
+            res.pop(i + 1)
+        else:
+            i += 1
+    return res

modules/safe.py ADDED Viewed

	@@ -0,0 +1,188 @@

+# this code is adapted from the script contributed by anon from /h/
+# modified, from https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/6cff4401824299a983c8e13424018efc347b4a2b/modules/safe.py
+import io
+import pickle
+import collections
+import sys
+import traceback
+import torch
+import numpy
+import _codecs
+import zipfile
+import re
+# PyTorch 1.13 and later have _TypedStorage renamed to TypedStorage;
+# use whichever name this torch version provides.
+TypedStorage = torch.storage.TypedStorage if hasattr(torch.storage, 'TypedStorage') else torch.storage._TypedStorage
+def encode(*args):
+    """Thin wrapper around `_codecs.encode`, forwarding all positional arguments unchanged."""
+    out = _codecs.encode(*args)
+    return out
+class RestrictedUnpickler(pickle.Unpickler):
+    extra_handler = None
+    def persistent_load(self, saved_id):
+        assert saved_id[0] == 'storage'
+        return TypedStorage()
+    def find_class(self, module, name):
+        if self.extra_handler is not None:
+            res = self.extra_handler(module, name)
+            if res is not None:
+                return res
+        if module == 'collections' and name == 'OrderedDict':
+            return getattr(collections, name)
+        if module == 'torch._utils' and name in ['_rebuild_tensor_v2', '_rebuild_parameter', '_rebuild_device_tensor_from_numpy']:
+            return getattr(torch._utils, name)
+        if module == 'torch' and name in ['FloatStorage', 'HalfStorage', 'IntStorage', 'LongStorage', 'DoubleStorage', 'ByteStorage', 'float32']:
+            return getattr(torch, name)
+        if module == 'torch.nn.modules.container' and name in ['ParameterDict']:
+            return getattr(torch.nn.modules.container, name)
+        if module == 'numpy.core.multiarray' and name in ['scalar', '_reconstruct']:
+            return getattr(numpy.core.multiarray, name)
+        if module == 'numpy' and name in ['dtype', 'ndarray']:
+            return getattr(numpy, name)
+        if module == '_codecs' and name == 'encode':
+            return encode
+        if module == "pytorch_lightning.callbacks" and name == 'model_checkpoint':
+            import pytorch_lightning.callbacks
+            return pytorch_lightning.callbacks.model_checkpoint
+        if module == "pytorch_lightning.callbacks.model_checkpoint" and name == 'ModelCheckpoint':
+            import pytorch_lightning.callbacks.model_checkpoint
+            return pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint
+        if module == "__builtin__" and name == 'set':
+            return set
+        # Forbid everything else.
+        raise Exception(f"global '{module}/{name}' is forbidden")
+# Regular expression that accepts 'dirname/version', 'dirname/data.pkl', and 'dirname/data/<number>'
+# (exactly one directory level in front; nothing else is allowed inside the archive)
+allowed_zip_names_re = re.compile(r"^([^/]+)/((data/\d+)|version|(data\.pkl))$")
+# Matches only the pickle member itself: '<directory name>/data.pkl'
+data_pkl_re = re.compile(r"^([^/]+)/data\.pkl$")
+def check_zip_filenames(filename, names):
+    for name in names:
+        if allowed_zip_names_re.match(name):
+            continue
+        raise Exception(f"bad file inside {filename}: {name}")
+def check_pt(filename, extra_handler):
+    try:
+        # new pytorch format is a zip file
+        with zipfile.ZipFile(filename) as z:
+            check_zip_filenames(filename, z.namelist())
+            # find filename of data.pkl in zip file: '<directory name>/data.pkl'
+            data_pkl_filenames = [f for f in z.namelist() if data_pkl_re.match(f)]
+            if len(data_pkl_filenames) == 0:
+                raise Exception(f"data.pkl not found in {filename}")
+            if len(data_pkl_filenames) > 1:
+                raise Exception(f"Multiple data.pkl found in {filename}")
+            with z.open(data_pkl_filenames[0]) as file:
+                unpickler = RestrictedUnpickler(file)
+                unpickler.extra_handler = extra_handler
+                unpickler.load()
+    except zipfile.BadZipfile:
+        # if it's not a zip file, it's an olf pytorch format, with five objects written to pickle
+        with open(filename, "rb") as file:
+            unpickler = RestrictedUnpickler(file)
+            unpickler.extra_handler = extra_handler
+            for i in range(5):
+                unpickler.load()
+def load(filename, *args, **kwargs):
+    return load_with_extra(filename, extra_handler=global_extra_handler, *args, **kwargs)
+def load_with_extra(filename, extra_handler=None, *args, **kwargs):
+    """
+    this function is intended to be used by extensions that want to load models with
+    some extra classes in them that the usual unpickler would find suspicious.
+    Use the extra_handler argument to specify a function that takes module and field name as text,
+    and returns that field's value:
+    ```python
+    def extra(module, name):
+        if module == 'collections' and name == 'OrderedDict':
+            return collections.OrderedDict
+        return None
+    safe.load_with_extra('model.pt', extra_handler=extra)
+    ```
+    The alternative to this is just to use safe.unsafe_torch_load('model.pt'), which as the name implies is
+    definitely unsafe.
+    """
+    try:
+        check_pt(filename, extra_handler)
+    except pickle.UnpicklingError:
+        print(f"Error verifying pickled file from {filename}:", file=sys.stderr)
+        print(traceback.format_exc(), file=sys.stderr)
+        print("The file is most likely corrupted.", file=sys.stderr)
+        return None
+    except Exception:
+        print(f"Error verifying pickled file from {filename}:", file=sys.stderr)
+        print(traceback.format_exc(), file=sys.stderr)
+        print("\nThe file may be malicious, so the program is not going to read it.", file=sys.stderr)
+        print("You can skip this check with --disable-safe-unpickle commandline argument.\n\n", file=sys.stderr)
+        return None
+    return unsafe_torch_load(filename, *args, **kwargs)
+class Extra:
+    """
+    A class for temporarily setting the global handler for when you can't explicitly call load_with_extra
+    (because it's not your code making the torch.load call). The intended use is like this:
+    ```
+    import torch
+    from modules import safe
+    def handler(module, name):
+        if module == 'torch' and name in ['float64', 'float16']:
+            return getattr(torch, name)
+        return None
+    with safe.Extra(handler):
+        x = torch.load('model.pt')
+    ```
+    Blocks cannot be nested: entering one while a handler is already installed fails an assertion.
+    """
+    def __init__(self, handler):
+        # callable (module, name) -> object or None, installed while inside the block
+        self.handler = handler
+    def __enter__(self):
+        global global_extra_handler
+        assert global_extra_handler is None, 'already inside an Extra() block'
+        global_extra_handler = self.handler
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # always clear the handler; returns None, so exceptions are not suppressed
+        global global_extra_handler
+        global_extra_handler = None
+# Keep a reference to the original loader, then replace torch.load for the whole
+# process with the checked `load` defined above (a module-import side effect).
+unsafe_torch_load = torch.load
+torch.load = load
+# handler used by `load`; set temporarily by the Extra context manager
+global_extra_handler = None