WIP
- .gitignore +2 -0
- app.py +113 -11
- lora_diffusion/FOR-cloneofsimo-LoRA +6 -0
- lora_diffusion/__init__.py +5 -0
- lora_diffusion/cli_lora_add.py +187 -0
- lora_diffusion/cli_lora_pti.py +1040 -0
- lora_diffusion/cli_pt_to_safetensors.py +85 -0
- lora_diffusion/cli_svd.py +146 -0
- lora_diffusion/dataset.py +311 -0
- lora_diffusion/lora.py +1110 -0
- lora_diffusion/lora_manager.py +144 -0
- lora_diffusion/preprocess_files.py +327 -0
- lora_diffusion/safe_open.py +68 -0
- lora_diffusion/to_ckpt_v2.py +232 -0
- lora_diffusion/utils.py +214 -0
- lora_diffusion/xformers_utils.py +70 -0
- requirements.txt +3 -0
- train_dreambooth_cloneofsimo_lora.py +1008 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
*/__pycache__/
*/*.pyc
app.py
CHANGED
@@ -1,16 +1,118 @@
 import gradio as gr
+import shutil
+import zipfile
+import tensorflow as tf
 import pandas as pd
+import pathlib
+import PIL.Image
+import os
+import subprocess
 
-def
-
-
+def pad_image(image: PIL.Image.Image) -> PIL.Image.Image:
+    w, h = image.size
+    if w == h:
+        return image
+    elif w > h:
+        new_image = PIL.Image.new(image.mode, (w, w), (0, 0, 0))
+        new_image.paste(image, (0, (w - h) // 2))
+        return new_image
+    else:
+        new_image = PIL.Image.new(image.mode, (h, h), (0, 0, 0))
+        new_image.paste(image, ((h - w) // 2, 0))
+        return new_image
 
-iface = gr.Interface(
-    fn=load_csv,
-    inputs="file",
-    outputs="dataframe",
-    title="CSV Loader",
-    description="Load a CSV file and display its contents.",
-)
 
-
+class ModelTrainer:
+    def __init__(self):
+        self.training_pictures = []
+        self.training_model = None
+
+    def unzip_file(self, zip_file_path):
+        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
+            extracted_path = zip_file_path.replace('.zip', '')
+            zip_ref.extractall(extracted_path)
+            file_names = zip_ref.namelist()
+            for file_name in file_names:
+                if file_name.endswith(('.jpeg', '.jpg', '.png')):
+                    self.training_pictures.append(f'{extracted_path}/{file_name}')
+
+    def train(self, pretrained_model_name_or_path: str, instance_images: list | None):
+        output_model_name = 'a-xyz-model'
+        resolution = 512
+        repo_dir = pathlib.Path(__file__).parent
+        subdirs = ['train-instance', 'train-class', 'experiments']
+        dir_paths = []
+
+        for subdir in subdirs:
+            dir_path = repo_dir / subdir / output_model_name
+            dir_paths.append(dir_path)
+            shutil.rmtree(dir_path, ignore_errors=True)
+            os.makedirs(dir_path, exist_ok=True)
+
+        instance_data_dir, class_data_dir, output_dir = dir_paths
+
+        for i, temp_path in enumerate(instance_images):
+            image = PIL.Image.open(temp_path.name)
+            image = pad_image(image)
+            image = image.resize((resolution, resolution))
+            image = image.convert('RGB')
+            out_path = instance_data_dir / f'{i:03d}.jpg'
+            image.save(out_path, format='JPEG', quality=100)
+
+        command = [
+            'python', '-u',
+            'train_dreambooth_cloneofsimo_lora.py',
+            '--pretrained_model_name_or_path', pretrained_model_name_or_path,
+            '--instance_data_dir', instance_data_dir,
+            '--class_data_dir', class_data_dir,
+            '--resolution', '768',
+            '--output_dir', output_dir,
+            '--instance_prompt', 'a photo of a pwsm dog',
+            '--with_prior_preservation',
+            '--class_prompt', 'a dog',
+            '--prior_loss_weight', '1.0',
+            '--num_class_images', '100',
+            '--learning_rate', '0.0004',
+            '--train_batch_size', '1',
+            '--sample_batch_size', '1',
+            '--max_train_steps', '400',
+            '--gradient_accumulation_steps', '1',
+            '--gradient_checkpointing',
+            '--train_text_encoder',
+            '--learning_rate_text', '5e-6',
+            '--save_steps', '100',
+            '--seed', '1337',
+            '--lr_scheduler', 'constant',
+            '--lr_warmup_steps', '0'
+        ]
+
+        result = subprocess.run(command)
+        return result
+
+    def generate_picture(self, row):
+        num_of_training_steps, learning_rate, checkpoint_steps, abc = row
+        return f'Picture generated for num_of_training_steps: {num_of_training_steps}, learning_rate: {learning_rate}, checkpoint_steps: {checkpoint_steps}'
+
+    def generate_pictures(self, csv_input):
+        csv = pd.read_csv(csv_input.name)
+        result = []
+        for index, row in csv.iterrows():
+            result.append(self.generate_picture(row))
+        return "\n".join(str(item) for item in result)
+
+loader = ModelTrainer()
+
+with gr.Blocks() as demo:
+    with gr.Box():
+        instance_images = gr.Files(label='Instance images')
+        pretrained_model_name_or_path = gr.inputs.Textbox(lines=1, label='pretrained_model_name_or_path', default='stabilityai/stable-diffusion-2-1')
+        output_message = gr.Markdown()
+        train_button = gr.Button('Train')
+        train_button.click(fn=loader.train, inputs=[pretrained_model_name_or_path, instance_images], outputs=[output_message])
+    with gr.Box():
+        csv_input = gr.inputs.File(label='CSV File')
+        output_message2 = gr.Markdown()
+        generate_button = gr.Button('Generate Pictures from CSV')
+        generate_button.click(fn=loader.generate_pictures, inputs=[csv_input], outputs=[output_message2])
+
+demo.launch()
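
For reference, here is a minimal standalone sketch of the preprocessing the new train() method applies to each uploaded instance image (square-pad with black, resize to the training resolution, convert to RGB, save as JPEG). The input path 'example.png' and the 'out' directory are placeholders for illustration, not files from this Space.

import pathlib
import PIL.Image

def pad_to_square(image: PIL.Image.Image) -> PIL.Image.Image:
    # Same logic as pad_image() in app.py: letterbox the short side with black.
    w, h = image.size
    if w == h:
        return image
    side = max(w, h)
    canvas = PIL.Image.new(image.mode, (side, side), (0, 0, 0))
    canvas.paste(image, ((side - w) // 2, (side - h) // 2))
    return canvas

if __name__ == "__main__":
    resolution = 512  # app.py resizes instance images to 512x512 before training
    out_dir = pathlib.Path("out")  # placeholder output directory
    out_dir.mkdir(exist_ok=True)
    img = PIL.Image.open("example.png")  # placeholder input path
    img = pad_to_square(img).resize((resolution, resolution)).convert("RGB")
    img.save(out_dir / "000.jpg", format="JPEG", quality=100)
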
lora_diffusion/FOR-cloneofsimo-LoRA
ADDED
@@ -0,0 +1,6 @@
This 'lora_diffusion' library in this subdirectory is required by the
'train_dreambooth_cloneofsimo_lora.py' script and is the underlying library of the
https://github.com/cloneofsimo/lora project.

The 'train_dreambooth_cloneofsimo_lora.py' script, in turn, is merely a renamed copy
of 'training_scripts/train_lora_dreambooth.py' from that same project.
lora_diffusion/__init__.py
ADDED
@@ -0,0 +1,5 @@
from .lora import *
from .dataset import *
from .utils import *
from .preprocess_files import *
from .lora_manager import *
lora_diffusion/cli_lora_add.py
ADDED
@@ -0,0 +1,187 @@
from typing import Literal, Union, Dict
import os
import shutil
import fire
from diffusers import StableDiffusionPipeline
from safetensors.torch import safe_open, save_file

import torch
from .lora import (
    tune_lora_scale,
    patch_pipe,
    collapse_lora,
    monkeypatch_remove_lora,
)
from .lora_manager import lora_join
from .to_ckpt_v2 import convert_to_ckpt


def _text_lora_path(path: str) -> str:
    assert path.endswith(".pt"), "Only .pt files are supported"
    return ".".join(path.split(".")[:-1] + ["text_encoder", "pt"])


def add(
    path_1: str,
    path_2: str,
    output_path: str,
    alpha_1: float = 0.5,
    alpha_2: float = 0.5,
    mode: Literal[
        "lpl",
        "upl",
        "upl-ckpt-v2",
    ] = "lpl",
    with_text_lora: bool = False,
):
    print("Lora Add, mode " + mode)
    if mode == "lpl":
        if path_1.endswith(".pt") and path_2.endswith(".pt"):
            for _path_1, _path_2, opt in [(path_1, path_2, "unet")] + (
                [(_text_lora_path(path_1), _text_lora_path(path_2), "text_encoder")]
                if with_text_lora
                else []
            ):
                print("Loading", _path_1, _path_2)
                out_list = []
                if opt == "text_encoder":
                    if not os.path.exists(_path_1):
                        print(f"No text encoder found in {_path_1}, skipping...")
                        continue
                    if not os.path.exists(_path_2):
                        print(f"No text encoder found in {_path_2}, skipping...")
                        continue

                l1 = torch.load(_path_1)
                l2 = torch.load(_path_2)

                l1pairs = zip(l1[::2], l1[1::2])
                l2pairs = zip(l2[::2], l2[1::2])

                for (x1, y1), (x2, y2) in zip(l1pairs, l2pairs):
                    # print("Merging", x1.shape, y1.shape, x2.shape, y2.shape)
                    x1.data = alpha_1 * x1.data + alpha_2 * x2.data
                    y1.data = alpha_1 * y1.data + alpha_2 * y2.data

                    out_list.append(x1)
                    out_list.append(y1)

                if opt == "unet":

                    print("Saving merged UNET to", output_path)
                    torch.save(out_list, output_path)

                elif opt == "text_encoder":
                    print("Saving merged text encoder to", _text_lora_path(output_path))
                    torch.save(
                        out_list,
                        _text_lora_path(output_path),
                    )

        elif path_1.endswith(".safetensors") and path_2.endswith(".safetensors"):
            safeloras_1 = safe_open(path_1, framework="pt", device="cpu")
            safeloras_2 = safe_open(path_2, framework="pt", device="cpu")

            metadata = dict(safeloras_1.metadata())
            metadata.update(dict(safeloras_2.metadata()))

            ret_tensor = {}

            for keys in set(list(safeloras_1.keys()) + list(safeloras_2.keys())):
                if keys.startswith("text_encoder") or keys.startswith("unet"):

                    tens1 = safeloras_1.get_tensor(keys)
                    tens2 = safeloras_2.get_tensor(keys)

                    tens = alpha_1 * tens1 + alpha_2 * tens2
                    ret_tensor[keys] = tens
                else:
                    if keys in safeloras_1.keys():

                        tens1 = safeloras_1.get_tensor(keys)
                    else:
                        tens1 = safeloras_2.get_tensor(keys)

                    ret_tensor[keys] = tens1

            save_file(ret_tensor, output_path, metadata)

    elif mode == "upl":

        print(
            f"Merging UNET/CLIP from {path_1} with LoRA from {path_2} to {output_path}. Merging ratio : {alpha_1}."
        )

        loaded_pipeline = StableDiffusionPipeline.from_pretrained(
            path_1,
        ).to("cpu")

        patch_pipe(loaded_pipeline, path_2)

        collapse_lora(loaded_pipeline.unet, alpha_1)
        collapse_lora(loaded_pipeline.text_encoder, alpha_1)

        monkeypatch_remove_lora(loaded_pipeline.unet)
        monkeypatch_remove_lora(loaded_pipeline.text_encoder)

        loaded_pipeline.save_pretrained(output_path)

    elif mode == "upl-ckpt-v2":

        assert output_path.endswith(".ckpt"), "Only .ckpt files are supported"
        name = os.path.basename(output_path)[0:-5]

        print(
            f"You will be using {name} as the token in A1111 webui. Make sure {name} is unique enough token."
        )

        loaded_pipeline = StableDiffusionPipeline.from_pretrained(
            path_1,
        ).to("cpu")

        tok_dict = patch_pipe(loaded_pipeline, path_2, patch_ti=False)

        collapse_lora(loaded_pipeline.unet, alpha_1)
        collapse_lora(loaded_pipeline.text_encoder, alpha_1)

        monkeypatch_remove_lora(loaded_pipeline.unet)
        monkeypatch_remove_lora(loaded_pipeline.text_encoder)

        _tmp_output = output_path + ".tmp"

        loaded_pipeline.save_pretrained(_tmp_output)
        convert_to_ckpt(_tmp_output, output_path, as_half=True)
        # remove the tmp_output folder
        shutil.rmtree(_tmp_output)

        keys = sorted(tok_dict.keys())
        tok_catted = torch.stack([tok_dict[k] for k in keys])
        ret = {
            "string_to_token": {"*": torch.tensor(265)},
            "string_to_param": {"*": tok_catted},
            "name": name,
        }

        torch.save(ret, output_path[:-5] + ".pt")
        print(
            f"Textual embedding saved as {output_path[:-5]}.pt, put it in the embedding folder and use it as {name} in A1111 repo, "
        )
    elif mode == "ljl":
        print("Using Join mode : alpha will not have an effect here.")
        assert path_1.endswith(".safetensors") and path_2.endswith(
            ".safetensors"
        ), "Only .safetensors files are supported"

        safeloras_1 = safe_open(path_1, framework="pt", device="cpu")
        safeloras_2 = safe_open(path_2, framework="pt", device="cpu")

        total_tensor, total_metadata, _, _ = lora_join([safeloras_1, safeloras_2])
        save_file(total_tensor, output_path, total_metadata)

    else:
        print("Unknown mode", mode)
        raise ValueError(f"Unknown mode {mode}")


def main():
    fire.Fire(add)
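
For orientation, a minimal sketch of calling the merge entry point defined above directly from Python rather than through fire; the three .pt paths are placeholders and assume two LoRA weight files saved in the legacy .pt format.

from lora_diffusion.cli_lora_add import add

# Blend two .pt LoRAs ("lpl" mode) 50/50; all paths are placeholders.
add(
    path_1="lora_a.pt",
    path_2="lora_b.pt",
    output_path="lora_merged.pt",
    alpha_1=0.5,
    alpha_2=0.5,
    mode="lpl",
    with_text_lora=False,
)
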
lora_diffusion/cli_lora_pti.py
ADDED
@@ -0,0 +1,1040 @@
# Bootstrapped from:
# https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py

import argparse
import hashlib
import inspect
import itertools
import math
import os
import random
import re
from pathlib import Path
from typing import Optional, List, Literal

import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.checkpoint
from diffusers import (
    AutoencoderKL,
    DDPMScheduler,
    StableDiffusionPipeline,
    UNet2DConditionModel,
)
from diffusers.optimization import get_scheduler
from huggingface_hub import HfFolder, Repository, whoami
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer
import wandb
import fire

from lora_diffusion import (
    PivotalTuningDatasetCapation,
    extract_lora_ups_down,
    inject_trainable_lora,
    inject_trainable_lora_extended,
    inspect_lora,
    save_lora_weight,
    save_all,
    prepare_clip_model_sets,
    evaluate_pipe,
    UNET_EXTENDED_TARGET_REPLACE,
)


def get_models(
    pretrained_model_name_or_path,
    pretrained_vae_name_or_path,
    revision,
    placeholder_tokens: List[str],
    initializer_tokens: List[str],
    device="cuda:0",
):

    tokenizer = CLIPTokenizer.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="tokenizer",
        revision=revision,
    )

    text_encoder = CLIPTextModel.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="text_encoder",
        revision=revision,
    )

    placeholder_token_ids = []

    for token, init_tok in zip(placeholder_tokens, initializer_tokens):
        num_added_tokens = tokenizer.add_tokens(token)
        if num_added_tokens == 0:
            raise ValueError(
                f"The tokenizer already contains the token {token}. Please pass a different"
                " `placeholder_token` that is not already in the tokenizer."
            )

        placeholder_token_id = tokenizer.convert_tokens_to_ids(token)

        placeholder_token_ids.append(placeholder_token_id)

        # Load models and create wrapper for stable diffusion

        text_encoder.resize_token_embeddings(len(tokenizer))
        token_embeds = text_encoder.get_input_embeddings().weight.data
        if init_tok.startswith("<rand"):
            # <rand-"sigma">, e.g. <rand-0.5>
            sigma_val = float(re.findall(r"<rand-(.*)>", init_tok)[0])

            token_embeds[placeholder_token_id] = (
                torch.randn_like(token_embeds[0]) * sigma_val
            )
            print(
                f"Initialized {token} with random noise (sigma={sigma_val}), empirically {token_embeds[placeholder_token_id].mean().item():.3f} +- {token_embeds[placeholder_token_id].std().item():.3f}"
            )
            print(f"Norm : {token_embeds[placeholder_token_id].norm():.4f}")

        elif init_tok == "<zero>":
            token_embeds[placeholder_token_id] = torch.zeros_like(token_embeds[0])
        else:
            token_ids = tokenizer.encode(init_tok, add_special_tokens=False)
            # Check if initializer_token is a single token or a sequence of tokens
            if len(token_ids) > 1:
                raise ValueError("The initializer token must be a single token.")

            initializer_token_id = token_ids[0]
            token_embeds[placeholder_token_id] = token_embeds[initializer_token_id]

    vae = AutoencoderKL.from_pretrained(
        pretrained_vae_name_or_path or pretrained_model_name_or_path,
        subfolder=None if pretrained_vae_name_or_path else "vae",
        revision=None if pretrained_vae_name_or_path else revision,
    )
    unet = UNet2DConditionModel.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="unet",
        revision=revision,
    )

    return (
        text_encoder.to(device),
        vae.to(device),
        unet.to(device),
        tokenizer,
        placeholder_token_ids,
    )


@torch.no_grad()
def text2img_dataloader(
    train_dataset,
    train_batch_size,
    tokenizer,
    vae,
    text_encoder,
    cached_latents: bool = False,
):

    if cached_latents:
        cached_latents_dataset = []
        for idx in tqdm(range(len(train_dataset))):
            batch = train_dataset[idx]
            # print(batch)
            latents = vae.encode(
                batch["instance_images"].unsqueeze(0).to(dtype=vae.dtype).to(vae.device)
            ).latent_dist.sample()
            latents = latents * 0.18215
            batch["instance_images"] = latents.squeeze(0)
            cached_latents_dataset.append(batch)

    def collate_fn(examples):
        input_ids = [example["instance_prompt_ids"] for example in examples]
        pixel_values = [example["instance_images"] for example in examples]
        pixel_values = torch.stack(pixel_values)
        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()

        input_ids = tokenizer.pad(
            {"input_ids": input_ids},
            padding="max_length",
            max_length=tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids

        batch = {
            "input_ids": input_ids,
            "pixel_values": pixel_values,
        }

        if examples[0].get("mask", None) is not None:
            batch["mask"] = torch.stack([example["mask"] for example in examples])

        return batch

    if cached_latents:

        train_dataloader = torch.utils.data.DataLoader(
            cached_latents_dataset,
            batch_size=train_batch_size,
            shuffle=True,
            collate_fn=collate_fn,
        )

        print("PTI : Using cached latent.")

    else:
        train_dataloader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=train_batch_size,
            shuffle=True,
            collate_fn=collate_fn,
        )

    return train_dataloader


def inpainting_dataloader(
    train_dataset, train_batch_size, tokenizer, vae, text_encoder
):
    def collate_fn(examples):
        input_ids = [example["instance_prompt_ids"] for example in examples]
        pixel_values = [example["instance_images"] for example in examples]
        mask_values = [example["instance_masks"] for example in examples]
        masked_image_values = [
            example["instance_masked_images"] for example in examples
        ]

        # Concat class and instance examples for prior preservation.
        # We do this to avoid doing two forward passes.
        if examples[0].get("class_prompt_ids", None) is not None:
            input_ids += [example["class_prompt_ids"] for example in examples]
            pixel_values += [example["class_images"] for example in examples]
            mask_values += [example["class_masks"] for example in examples]
            masked_image_values += [
                example["class_masked_images"] for example in examples
            ]

        pixel_values = (
            torch.stack(pixel_values).to(memory_format=torch.contiguous_format).float()
        )
        mask_values = (
            torch.stack(mask_values).to(memory_format=torch.contiguous_format).float()
        )
        masked_image_values = (
            torch.stack(masked_image_values)
            .to(memory_format=torch.contiguous_format)
            .float()
        )

        input_ids = tokenizer.pad(
            {"input_ids": input_ids},
            padding="max_length",
            max_length=tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids

        batch = {
            "input_ids": input_ids,
            "pixel_values": pixel_values,
            "mask_values": mask_values,
            "masked_image_values": masked_image_values,
        }

        if examples[0].get("mask", None) is not None:
            batch["mask"] = torch.stack([example["mask"] for example in examples])

        return batch

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=train_batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )

    return train_dataloader


def loss_step(
    batch,
    unet,
    vae,
    text_encoder,
    scheduler,
    train_inpainting=False,
    t_mutliplier=1.0,
    mixed_precision=False,
    mask_temperature=1.0,
    cached_latents: bool = False,
):
    weight_dtype = torch.float32
    if not cached_latents:
        latents = vae.encode(
            batch["pixel_values"].to(dtype=weight_dtype).to(unet.device)
        ).latent_dist.sample()
        latents = latents * 0.18215

        if train_inpainting:
            masked_image_latents = vae.encode(
                batch["masked_image_values"].to(dtype=weight_dtype).to(unet.device)
            ).latent_dist.sample()
            masked_image_latents = masked_image_latents * 0.18215
            mask = F.interpolate(
                batch["mask_values"].to(dtype=weight_dtype).to(unet.device),
                scale_factor=1 / 8,
            )
    else:
        latents = batch["pixel_values"]

        if train_inpainting:
            masked_image_latents = batch["masked_image_latents"]
            mask = batch["mask_values"]

    noise = torch.randn_like(latents)
    bsz = latents.shape[0]

    timesteps = torch.randint(
        0,
        int(scheduler.config.num_train_timesteps * t_mutliplier),
        (bsz,),
        device=latents.device,
    )
    timesteps = timesteps.long()

    noisy_latents = scheduler.add_noise(latents, noise, timesteps)

    if train_inpainting:
        latent_model_input = torch.cat(
            [noisy_latents, mask, masked_image_latents], dim=1
        )
    else:
        latent_model_input = noisy_latents

    if mixed_precision:
        with torch.cuda.amp.autocast():

            encoder_hidden_states = text_encoder(
                batch["input_ids"].to(text_encoder.device)
            )[0]

            model_pred = unet(
                latent_model_input, timesteps, encoder_hidden_states
            ).sample
    else:

        encoder_hidden_states = text_encoder(
            batch["input_ids"].to(text_encoder.device)
        )[0]

        model_pred = unet(latent_model_input, timesteps, encoder_hidden_states).sample

    if scheduler.config.prediction_type == "epsilon":
        target = noise
    elif scheduler.config.prediction_type == "v_prediction":
        target = scheduler.get_velocity(latents, noise, timesteps)
    else:
        raise ValueError(f"Unknown prediction type {scheduler.config.prediction_type}")

    if batch.get("mask", None) is not None:

        mask = (
            batch["mask"]
            .to(model_pred.device)
            .reshape(
                model_pred.shape[0], 1, model_pred.shape[2] * 8, model_pred.shape[3] * 8
            )
        )
        # resize to match model_pred
        mask = F.interpolate(
            mask.float(),
            size=model_pred.shape[-2:],
            mode="nearest",
        )

        mask = (mask + 0.01).pow(mask_temperature)

        mask = mask / mask.max()

        model_pred = model_pred * mask

        target = target * mask

    loss = (
        F.mse_loss(model_pred.float(), target.float(), reduction="none")
        .mean([1, 2, 3])
        .mean()
    )

    return loss


def train_inversion(
    unet,
    vae,
    text_encoder,
    dataloader,
    num_steps: int,
    scheduler,
    index_no_updates,
    optimizer,
    save_steps: int,
    placeholder_token_ids,
    placeholder_tokens,
    save_path: str,
    tokenizer,
    lr_scheduler,
    test_image_path: str,
    cached_latents: bool,
    accum_iter: int = 1,
    log_wandb: bool = False,
    wandb_log_prompt_cnt: int = 10,
    class_token: str = "person",
    train_inpainting: bool = False,
    mixed_precision: bool = False,
    clip_ti_decay: bool = True,
):

    progress_bar = tqdm(range(num_steps))
    progress_bar.set_description("Steps")
    global_step = 0

    # Original Emb for TI
    orig_embeds_params = text_encoder.get_input_embeddings().weight.data.clone()

    if log_wandb:
        preped_clip = prepare_clip_model_sets()

    index_updates = ~index_no_updates
    loss_sum = 0.0

    for epoch in range(math.ceil(num_steps / len(dataloader))):
        unet.eval()
        text_encoder.train()
        for batch in dataloader:

            lr_scheduler.step()

            with torch.set_grad_enabled(True):
                loss = (
                    loss_step(
                        batch,
                        unet,
                        vae,
                        text_encoder,
                        scheduler,
                        train_inpainting=train_inpainting,
                        mixed_precision=mixed_precision,
                        cached_latents=cached_latents,
                    )
                    / accum_iter
                )

                loss.backward()
                loss_sum += loss.detach().item()

                if global_step % accum_iter == 0:
                    # print gradient of text encoder embedding
                    print(
                        text_encoder.get_input_embeddings()
                        .weight.grad[index_updates, :]
                        .norm(dim=-1)
                        .mean()
                    )
                    optimizer.step()
                    optimizer.zero_grad()

                    with torch.no_grad():

                        # normalize embeddings
                        if clip_ti_decay:
                            pre_norm = (
                                text_encoder.get_input_embeddings()
                                .weight[index_updates, :]
                                .norm(dim=-1, keepdim=True)
                            )

                            lambda_ = min(1.0, 100 * lr_scheduler.get_last_lr()[0])
                            text_encoder.get_input_embeddings().weight[
                                index_updates
                            ] = F.normalize(
                                text_encoder.get_input_embeddings().weight[
                                    index_updates, :
                                ],
                                dim=-1,
                            ) * (
                                pre_norm + lambda_ * (0.4 - pre_norm)
                            )
                            print(pre_norm)

                        current_norm = (
                            text_encoder.get_input_embeddings()
                            .weight[index_updates, :]
                            .norm(dim=-1)
                        )

                        text_encoder.get_input_embeddings().weight[
                            index_no_updates
                        ] = orig_embeds_params[index_no_updates]

                        print(f"Current Norm : {current_norm}")

                global_step += 1
                progress_bar.update(1)

                logs = {
                    "loss": loss.detach().item(),
                    "lr": lr_scheduler.get_last_lr()[0],
                }
                progress_bar.set_postfix(**logs)

            if global_step % save_steps == 0:
                save_all(
                    unet=unet,
                    text_encoder=text_encoder,
                    placeholder_token_ids=placeholder_token_ids,
                    placeholder_tokens=placeholder_tokens,
                    save_path=os.path.join(
                        save_path, f"step_inv_{global_step}.safetensors"
                    ),
                    save_lora=False,
                )
                if log_wandb:
                    with torch.no_grad():
                        pipe = StableDiffusionPipeline(
                            vae=vae,
                            text_encoder=text_encoder,
                            tokenizer=tokenizer,
                            unet=unet,
                            scheduler=scheduler,
                            safety_checker=None,
                            feature_extractor=None,
                        )

                        # open all images in test_image_path
                        images = []
                        for file in os.listdir(test_image_path):
                            if (
                                file.lower().endswith(".png")
                                or file.lower().endswith(".jpg")
                                or file.lower().endswith(".jpeg")
                            ):
                                images.append(
                                    Image.open(os.path.join(test_image_path, file))
                                )

                        wandb.log({"loss": loss_sum / save_steps})
                        loss_sum = 0.0
                        wandb.log(
                            evaluate_pipe(
                                pipe,
                                target_images=images,
                                class_token=class_token,
                                learnt_token="".join(placeholder_tokens),
                                n_test=wandb_log_prompt_cnt,
                                n_step=50,
                                clip_model_sets=preped_clip,
                            )
                        )

            if global_step >= num_steps:
                return


def perform_tuning(
    unet,
    vae,
    text_encoder,
    dataloader,
    num_steps,
    scheduler,
    optimizer,
    save_steps: int,
    placeholder_token_ids,
    placeholder_tokens,
    save_path,
    lr_scheduler_lora,
    lora_unet_target_modules,
    lora_clip_target_modules,
    mask_temperature,
    out_name: str,
    tokenizer,
    test_image_path: str,
    cached_latents: bool,
    log_wandb: bool = False,
    wandb_log_prompt_cnt: int = 10,
    class_token: str = "person",
    train_inpainting: bool = False,
):

    progress_bar = tqdm(range(num_steps))
    progress_bar.set_description("Steps")
    global_step = 0

    weight_dtype = torch.float16

    unet.train()
    text_encoder.train()

    if log_wandb:
        preped_clip = prepare_clip_model_sets()

    loss_sum = 0.0

    for epoch in range(math.ceil(num_steps / len(dataloader))):
        for batch in dataloader:
            lr_scheduler_lora.step()

            optimizer.zero_grad()

            loss = loss_step(
                batch,
                unet,
                vae,
                text_encoder,
                scheduler,
                train_inpainting=train_inpainting,
                t_mutliplier=0.8,
                mixed_precision=True,
                mask_temperature=mask_temperature,
                cached_latents=cached_latents,
            )
            loss_sum += loss.detach().item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(
                itertools.chain(unet.parameters(), text_encoder.parameters()), 1.0
            )
            optimizer.step()
            progress_bar.update(1)
            logs = {
                "loss": loss.detach().item(),
                "lr": lr_scheduler_lora.get_last_lr()[0],
            }
            progress_bar.set_postfix(**logs)

            global_step += 1

            if global_step % save_steps == 0:
                save_all(
                    unet,
                    text_encoder,
                    placeholder_token_ids=placeholder_token_ids,
                    placeholder_tokens=placeholder_tokens,
                    save_path=os.path.join(
                        save_path, f"step_{global_step}.safetensors"
                    ),
                    target_replace_module_text=lora_clip_target_modules,
                    target_replace_module_unet=lora_unet_target_modules,
                )
                moved = (
                    torch.tensor(list(itertools.chain(*inspect_lora(unet).values())))
                    .mean()
                    .item()
                )

                print("LORA Unet Moved", moved)
                moved = (
                    torch.tensor(
                        list(itertools.chain(*inspect_lora(text_encoder).values()))
                    )
                    .mean()
                    .item()
                )

                print("LORA CLIP Moved", moved)

                if log_wandb:
                    with torch.no_grad():
                        pipe = StableDiffusionPipeline(
                            vae=vae,
                            text_encoder=text_encoder,
                            tokenizer=tokenizer,
                            unet=unet,
                            scheduler=scheduler,
                            safety_checker=None,
                            feature_extractor=None,
                        )

                        # open all images in test_image_path
                        images = []
                        for file in os.listdir(test_image_path):
                            if file.endswith(".png") or file.endswith(".jpg"):
                                images.append(
                                    Image.open(os.path.join(test_image_path, file))
                                )

                        wandb.log({"loss": loss_sum / save_steps})
                        loss_sum = 0.0
                        wandb.log(
                            evaluate_pipe(
                                pipe,
                                target_images=images,
                                class_token=class_token,
                                learnt_token="".join(placeholder_tokens),
                                n_test=wandb_log_prompt_cnt,
                                n_step=50,
                                clip_model_sets=preped_clip,
                            )
                        )

            if global_step >= num_steps:
                break

    save_all(
        unet,
        text_encoder,
        placeholder_token_ids=placeholder_token_ids,
        placeholder_tokens=placeholder_tokens,
        save_path=os.path.join(save_path, f"{out_name}.safetensors"),
        target_replace_module_text=lora_clip_target_modules,
        target_replace_module_unet=lora_unet_target_modules,
    )


def train(
    instance_data_dir: str,
    pretrained_model_name_or_path: str,
    output_dir: str,
    train_text_encoder: bool = True,
    pretrained_vae_name_or_path: str = None,
    revision: Optional[str] = None,
    perform_inversion: bool = True,
    use_template: Literal[None, "object", "style"] = None,
    train_inpainting: bool = False,
    placeholder_tokens: str = "",
    placeholder_token_at_data: Optional[str] = None,
    initializer_tokens: Optional[str] = None,
    seed: int = 42,
    resolution: int = 512,
    color_jitter: bool = True,
    train_batch_size: int = 1,
    sample_batch_size: int = 1,
    max_train_steps_tuning: int = 1000,
    max_train_steps_ti: int = 1000,
    save_steps: int = 100,
    gradient_accumulation_steps: int = 4,
    gradient_checkpointing: bool = False,
    lora_rank: int = 4,
    lora_unet_target_modules={"CrossAttention", "Attention", "GEGLU"},
    lora_clip_target_modules={"CLIPAttention"},
    lora_dropout_p: float = 0.0,
    lora_scale: float = 1.0,
    use_extended_lora: bool = False,
    clip_ti_decay: bool = True,
    learning_rate_unet: float = 1e-4,
    learning_rate_text: float = 1e-5,
    learning_rate_ti: float = 5e-4,
    continue_inversion: bool = False,
    continue_inversion_lr: Optional[float] = None,
    use_face_segmentation_condition: bool = False,
    cached_latents: bool = True,
    use_mask_captioned_data: bool = False,
    mask_temperature: float = 1.0,
    scale_lr: bool = False,
    lr_scheduler: str = "linear",
    lr_warmup_steps: int = 0,
    lr_scheduler_lora: str = "linear",
    lr_warmup_steps_lora: int = 0,
    weight_decay_ti: float = 0.00,
    weight_decay_lora: float = 0.001,
    use_8bit_adam: bool = False,
    device="cuda:0",
    extra_args: Optional[dict] = None,
    log_wandb: bool = False,
    wandb_log_prompt_cnt: int = 10,
    wandb_project_name: str = "new_pti_project",
    wandb_entity: str = "new_pti_entity",
    proxy_token: str = "person",
    enable_xformers_memory_efficient_attention: bool = False,
    out_name: str = "final_lora",
):
    torch.manual_seed(seed)

    if log_wandb:
        wandb.init(
            project=wandb_project_name,
            entity=wandb_entity,
            name=f"steps_{max_train_steps_ti}_lr_{learning_rate_ti}_{instance_data_dir.split('/')[-1]}",
            reinit=True,
            config={
                **(extra_args if extra_args is not None else {}),
            },
        )

    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)
    # print(placeholder_tokens, initializer_tokens)
    if len(placeholder_tokens) == 0:
        placeholder_tokens = []
        print("PTI : Placeholder Tokens not given, using null token")
    else:
        placeholder_tokens = placeholder_tokens.split("|")

        assert (
            sorted(placeholder_tokens) == placeholder_tokens
        ), f"Placeholder tokens should be sorted. Use something like {'|'.join(sorted(placeholder_tokens))}'"

    if initializer_tokens is None:
        print("PTI : Initializer Tokens not given, doing random inits")
        initializer_tokens = ["<rand-0.017>"] * len(placeholder_tokens)
    else:
        initializer_tokens = initializer_tokens.split("|")

    assert len(initializer_tokens) == len(
        placeholder_tokens
    ), "Unequal Initializer token for Placeholder tokens."

    if proxy_token is not None:
        class_token = proxy_token
    class_token = "".join(initializer_tokens)

    if placeholder_token_at_data is not None:
        tok, pat = placeholder_token_at_data.split("|")
        token_map = {tok: pat}

    else:
        token_map = {"DUMMY": "".join(placeholder_tokens)}

    print("PTI : Placeholder Tokens", placeholder_tokens)
    print("PTI : Initializer Tokens", initializer_tokens)

    # get the models
    text_encoder, vae, unet, tokenizer, placeholder_token_ids = get_models(
        pretrained_model_name_or_path,
        pretrained_vae_name_or_path,
        revision,
        placeholder_tokens,
        initializer_tokens,
        device=device,
    )

    noise_scheduler = DDPMScheduler.from_config(
        pretrained_model_name_or_path, subfolder="scheduler"
    )

    if gradient_checkpointing:
        unet.enable_gradient_checkpointing()

    if enable_xformers_memory_efficient_attention:
        from diffusers.utils.import_utils import is_xformers_available

        if is_xformers_available():
            unet.enable_xformers_memory_efficient_attention()
        else:
            raise ValueError(
                "xformers is not available. Make sure it is installed correctly"
            )

    if scale_lr:
        unet_lr = learning_rate_unet * gradient_accumulation_steps * train_batch_size
        text_encoder_lr = (
            learning_rate_text * gradient_accumulation_steps * train_batch_size
        )
        ti_lr = learning_rate_ti * gradient_accumulation_steps * train_batch_size
    else:
        unet_lr = learning_rate_unet
        text_encoder_lr = learning_rate_text
        ti_lr = learning_rate_ti

    train_dataset = PivotalTuningDatasetCapation(
        instance_data_root=instance_data_dir,
        token_map=token_map,
        use_template=use_template,
        tokenizer=tokenizer,
        size=resolution,
        color_jitter=color_jitter,
        use_face_segmentation_condition=use_face_segmentation_condition,
        use_mask_captioned_data=use_mask_captioned_data,
        train_inpainting=train_inpainting,
    )

    train_dataset.blur_amount = 200

    if train_inpainting:
        assert not cached_latents, "Cached latents not supported for inpainting"

        train_dataloader = inpainting_dataloader(
            train_dataset, train_batch_size, tokenizer, vae, text_encoder
        )
    else:
        train_dataloader = text2img_dataloader(
            train_dataset,
            train_batch_size,
            tokenizer,
            vae,
            text_encoder,
            cached_latents=cached_latents,
        )

    index_no_updates = torch.arange(len(tokenizer)) != -1

    for tok_id in placeholder_token_ids:
        index_no_updates[tok_id] = False

    unet.requires_grad_(False)
    vae.requires_grad_(False)

    params_to_freeze = itertools.chain(
        text_encoder.text_model.encoder.parameters(),
        text_encoder.text_model.final_layer_norm.parameters(),
        text_encoder.text_model.embeddings.position_embedding.parameters(),
    )
    for param in params_to_freeze:
        param.requires_grad = False

    if cached_latents:
        vae = None
    # STEP 1 : Perform Inversion
    if perform_inversion:
        ti_optimizer = optim.AdamW(
            text_encoder.get_input_embeddings().parameters(),
            lr=ti_lr,
            betas=(0.9, 0.999),
            eps=1e-08,
            weight_decay=weight_decay_ti,
        )

        lr_scheduler = get_scheduler(
            lr_scheduler,
            optimizer=ti_optimizer,
            num_warmup_steps=lr_warmup_steps,
            num_training_steps=max_train_steps_ti,
        )

        train_inversion(
            unet,
            vae,
            text_encoder,
            train_dataloader,
            max_train_steps_ti,
            cached_latents=cached_latents,
            accum_iter=gradient_accumulation_steps,
            scheduler=noise_scheduler,
            index_no_updates=index_no_updates,
            optimizer=ti_optimizer,
            lr_scheduler=lr_scheduler,
            save_steps=save_steps,
            placeholder_tokens=placeholder_tokens,
            placeholder_token_ids=placeholder_token_ids,
            save_path=output_dir,
            test_image_path=instance_data_dir,
            log_wandb=log_wandb,
            wandb_log_prompt_cnt=wandb_log_prompt_cnt,
            class_token=class_token,
            train_inpainting=train_inpainting,
            mixed_precision=False,
            tokenizer=tokenizer,
            clip_ti_decay=clip_ti_decay,
        )

        del ti_optimizer

    # Next perform Tuning with LoRA:
    if not use_extended_lora:
        unet_lora_params, _ = inject_trainable_lora(
            unet,
            r=lora_rank,
            target_replace_module=lora_unet_target_modules,
            dropout_p=lora_dropout_p,
            scale=lora_scale,
        )
    else:
        print("PTI : USING EXTENDED UNET!!!")
        lora_unet_target_modules = (
            lora_unet_target_modules | UNET_EXTENDED_TARGET_REPLACE
        )
        print("PTI : Will replace modules: ", lora_unet_target_modules)

        unet_lora_params, _ = inject_trainable_lora_extended(
            unet, r=lora_rank, target_replace_module=lora_unet_target_modules
        )
    print(f"PTI : has {len(unet_lora_params)} lora")

    print("PTI : Before training:")
    inspect_lora(unet)

    params_to_optimize = [
        {"params": itertools.chain(*unet_lora_params), "lr": unet_lr},
    ]

    text_encoder.requires_grad_(False)

    if continue_inversion:
        params_to_optimize += [
            {
                "params": text_encoder.get_input_embeddings().parameters(),
                "lr": continue_inversion_lr
                if continue_inversion_lr is not None
                else ti_lr,
            }
        ]
        text_encoder.requires_grad_(True)
        params_to_freeze = itertools.chain(
            text_encoder.text_model.encoder.parameters(),
            text_encoder.text_model.final_layer_norm.parameters(),
            text_encoder.text_model.embeddings.position_embedding.parameters(),
        )
        for param in params_to_freeze:
            param.requires_grad = False
    else:
        text_encoder.requires_grad_(False)
    if train_text_encoder:
        text_encoder_lora_params, _ = inject_trainable_lora(
            text_encoder,
            target_replace_module=lora_clip_target_modules,
            r=lora_rank,
        )
        params_to_optimize += [
            {
                "params": itertools.chain(*text_encoder_lora_params),
                "lr": text_encoder_lr,
            }
        ]
        inspect_lora(text_encoder)

    lora_optimizers = optim.AdamW(params_to_optimize, weight_decay=weight_decay_lora)

    unet.train()
    if train_text_encoder:
        text_encoder.train()

    train_dataset.blur_amount = 70

    lr_scheduler_lora = get_scheduler(
        lr_scheduler_lora,
        optimizer=lora_optimizers,
        num_warmup_steps=lr_warmup_steps_lora,
        num_training_steps=max_train_steps_tuning,
    )

    perform_tuning(
        unet,
        vae,
        text_encoder,
        train_dataloader,
        max_train_steps_tuning,
        cached_latents=cached_latents,
        scheduler=noise_scheduler,
        optimizer=lora_optimizers,
        save_steps=save_steps,
        placeholder_tokens=placeholder_tokens,
        placeholder_token_ids=placeholder_token_ids,
        save_path=output_dir,
        lr_scheduler_lora=lr_scheduler_lora,
        lora_unet_target_modules=lora_unet_target_modules,
        lora_clip_target_modules=lora_clip_target_modules,
        mask_temperature=mask_temperature,
        tokenizer=tokenizer,
        out_name=out_name,
        test_image_path=instance_data_dir,
        log_wandb=log_wandb,
        wandb_log_prompt_cnt=wandb_log_prompt_cnt,
        class_token=class_token,
        train_inpainting=train_inpainting,
    )


def main():
    fire.Fire(train)
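
As a usage sketch (not part of the commit), the train() entry point above can also be called directly from Python instead of through fire; all paths and token values below are placeholders, chosen to match the directory layout used in app.py.

from lora_diffusion.cli_lora_pti import train

# Placeholder directories and tokens; arguments mirror the signature of train() above.
train(
    instance_data_dir="./train-instance/a-xyz-model",
    pretrained_model_name_or_path="stabilityai/stable-diffusion-2-1",
    output_dir="./experiments/a-xyz-model",
    placeholder_tokens="<s1>",
    initializer_tokens="dog",
    use_template="object",
    max_train_steps_ti=500,
    max_train_steps_tuning=500,
    save_steps=100,
    lora_rank=4,
)
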
lora_diffusion/cli_pt_to_safetensors.py
ADDED
@@ -0,0 +1,85 @@
import os

import fire
import torch
from lora_diffusion import (
    DEFAULT_TARGET_REPLACE,
    TEXT_ENCODER_DEFAULT_TARGET_REPLACE,
    UNET_DEFAULT_TARGET_REPLACE,
    convert_loras_to_safeloras_with_embeds,
    safetensors_available,
)

_target_by_name = {
    "unet": UNET_DEFAULT_TARGET_REPLACE,
    "text_encoder": TEXT_ENCODER_DEFAULT_TARGET_REPLACE,
}


def convert(*paths, outpath, overwrite=False, **settings):
    """
    Converts one or more pytorch Lora and/or Textual Embedding pytorch files
    into a safetensor file.

    Pass all the input paths as arguments. Whether they are Textual Embedding
    or Lora models will be auto-detected.

    For Lora models, their name will be taken from the path, i.e.
        "lora_weight.pt" => unet
        "lora_weight.text_encoder.pt" => text_encoder

    You can also set target_modules and/or rank by providing an argument prefixed
    by the name.

    So a complete example might be something like:

    ```
    python -m lora_diffusion.cli_pt_to_safetensors lora_weight.* --outpath lora_weight.safetensor --unet.rank 8
    ```
    """
    modelmap = {}
    embeds = {}

    if os.path.exists(outpath) and not overwrite:
        raise ValueError(
            f"Output path {outpath} already exists, and overwrite is not True"
        )

    for path in paths:
        data = torch.load(path)

        if isinstance(data, dict):
            print(f"Loading textual inversion embeds {data.keys()} from {path}")
            embeds.update(data)

        else:
            name_parts = os.path.split(path)[1].split(".")
            name = name_parts[-2] if len(name_parts) > 2 else "unet"

            model_settings = {
                "target_modules": _target_by_name.get(name, DEFAULT_TARGET_REPLACE),
                "rank": 4,
            }

            prefix = f"{name}."

            arg_settings = { k[len(prefix) :]: v for k, v in settings.items() if k.startswith(prefix) }
            model_settings = { **model_settings, **arg_settings }

            print(f"Loading Lora for {name} from {path} with settings {model_settings}")

            modelmap[name] = (
                path,
                model_settings["target_modules"],
                model_settings["rank"],
            )

    convert_loras_to_safeloras_with_embeds(modelmap, embeds, outpath)


def main():
    fire.Fire(convert)


if __name__ == "__main__":
    main()
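
Equivalently to the CLI example in the docstring, convert() can be invoked from Python; the file names below are placeholders (a UNet LoRA and its text-encoder LoRA saved as .pt), and per-model options such as --unet.rank arrive through the **settings parameter when the script runs under fire.

from lora_diffusion.cli_pt_to_safetensors import convert

convert(
    "lora_weight.pt",               # placeholder; auto-detected as the "unet" LoRA
    "lora_weight.text_encoder.pt",  # placeholder; auto-detected as the "text_encoder" LoRA
    outpath="lora_weight.safetensors",
)
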
lora_diffusion/cli_svd.py
ADDED
@@ -0,0 +1,146 @@
import fire
from diffusers import StableDiffusionPipeline
import torch
import torch.nn as nn

from .lora import (
    save_all,
    _find_modules,
    LoraInjectedConv2d,
    LoraInjectedLinear,
    inject_trainable_lora,
    inject_trainable_lora_extended,
)


def _iter_lora(model):
    for module in model.modules():
        if isinstance(module, LoraInjectedConv2d) or isinstance(
            module, LoraInjectedLinear
        ):
            yield module


def overwrite_base(base_model, tuned_model, rank, clamp_quantile):
    device = base_model.device
    dtype = base_model.dtype

    for lor_base, lor_tune in zip(_iter_lora(base_model), _iter_lora(tuned_model)):

        if isinstance(lor_base, LoraInjectedLinear):
            residual = lor_tune.linear.weight.data - lor_base.linear.weight.data
            # SVD on residual
            print("Distill Linear shape ", residual.shape)
            residual = residual.float()
            U, S, Vh = torch.linalg.svd(residual)
            U = U[:, :rank]
            S = S[:rank]
            U = U @ torch.diag(S)

            Vh = Vh[:rank, :]

            dist = torch.cat([U.flatten(), Vh.flatten()])
            hi_val = torch.quantile(dist, clamp_quantile)
            low_val = -hi_val

            U = U.clamp(low_val, hi_val)
            Vh = Vh.clamp(low_val, hi_val)

            assert lor_base.lora_up.weight.shape == U.shape
            assert lor_base.lora_down.weight.shape == Vh.shape

            lor_base.lora_up.weight.data = U.to(device=device, dtype=dtype)
            lor_base.lora_down.weight.data = Vh.to(device=device, dtype=dtype)

        if isinstance(lor_base, LoraInjectedConv2d):
            residual = lor_tune.conv.weight.data - lor_base.conv.weight.data
            print("Distill Conv shape ", residual.shape)

            residual = residual.float()
            residual = residual.flatten(start_dim=1)

            # SVD on residual
            U, S, Vh = torch.linalg.svd(residual)
            U = U[:, :rank]
            S = S[:rank]
            U = U @ torch.diag(S)

            Vh = Vh[:rank, :]

            dist = torch.cat([U.flatten(), Vh.flatten()])
            hi_val = torch.quantile(dist, clamp_quantile)
            low_val = -hi_val

            U = U.clamp(low_val, hi_val)
            Vh = Vh.clamp(low_val, hi_val)

            # U is (out_channels, rank) with 1x1 conv. So,
            U = U.reshape(U.shape[0], U.shape[1], 1, 1)
            # V is (rank, in_channels * kernel_size1 * kernel_size2)
            # now reshape:
            Vh = Vh.reshape(
                Vh.shape[0],
                lor_base.conv.in_channels,
                lor_base.conv.kernel_size[0],
                lor_base.conv.kernel_size[1],
            )

            assert lor_base.lora_up.weight.shape == U.shape
            assert lor_base.lora_down.weight.shape == Vh.shape

            lor_base.lora_up.weight.data = U.to(device=device, dtype=dtype)
            lor_base.lora_down.weight.data = Vh.to(device=device, dtype=dtype)


def svd_distill(
    target_model: str,
    base_model: str,
    rank: int = 4,
    clamp_quantile: float = 0.99,
    device: str = "cuda:0",
    save_path: str = "svd_distill.safetensors",
):
    pipe_base = StableDiffusionPipeline.from_pretrained(
        base_model, torch_dtype=torch.float16
    ).to(device)

    pipe_tuned = StableDiffusionPipeline.from_pretrained(
        target_model, torch_dtype=torch.float16
    ).to(device)

    # Inject unet
    _ = inject_trainable_lora_extended(pipe_base.unet, r=rank)
    _ = inject_trainable_lora_extended(pipe_tuned.unet, r=rank)

    overwrite_base(
        pipe_base.unet, pipe_tuned.unet, rank=rank, clamp_quantile=clamp_quantile
    )

    # Inject text encoder
    _ = inject_trainable_lora(
        pipe_base.text_encoder, r=rank, target_replace_module={"CLIPAttention"}
    )
    _ = inject_trainable_lora(
        pipe_tuned.text_encoder, r=rank, target_replace_module={"CLIPAttention"}
    )

    overwrite_base(
        pipe_base.text_encoder,
        pipe_tuned.text_encoder,
        rank=rank,
        clamp_quantile=clamp_quantile,
    )

    save_all(
        unet=pipe_base.unet,
        text_encoder=pipe_base.text_encoder,
        placeholder_token_ids=None,
        placeholder_tokens=None,
        save_path=save_path,
        save_lora=True,
        save_ti=False,
    )


def main():
    fire.Fire(svd_distill)
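A minimal sketch of driving `svd_distill` from Python; the model identifiers and output path below are placeholders, not defaults of this repo:

```python
# Minimal sketch; model ids and the output path are placeholders.
from lora_diffusion.cli_svd import svd_distill

svd_distill(
    target_model="./my-fine-tuned-model",         # a full fine-tune (e.g. DreamBooth output)
    base_model="runwayml/stable-diffusion-v1-5",  # the checkpoint it started from
    rank=8,
    clamp_quantile=0.99,
    save_path="svd_distill.safetensors",
)
```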
lora_diffusion/dataset.py
ADDED
@@ -0,0 +1,311 @@
import random
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

from PIL import Image
from torch import zeros_like
from torch.utils.data import Dataset
from torchvision import transforms
import glob
from .preprocess_files import face_mask_google_mediapipe

OBJECT_TEMPLATE = [
    "a photo of a {}",
    "a rendering of a {}",
    "a cropped photo of the {}",
    "the photo of a {}",
    "a photo of a clean {}",
    "a photo of a dirty {}",
    "a dark photo of the {}",
    "a photo of my {}",
    "a photo of the cool {}",
    "a close-up photo of a {}",
    "a bright photo of the {}",
    "a cropped photo of a {}",
    "a photo of the {}",
    "a good photo of the {}",
    "a photo of one {}",
    "a close-up photo of the {}",
    "a rendition of the {}",
    "a photo of the clean {}",
    "a rendition of a {}",
    "a photo of a nice {}",
    "a good photo of a {}",
    "a photo of the nice {}",
    "a photo of the small {}",
    "a photo of the weird {}",
    "a photo of the large {}",
    "a photo of a cool {}",
    "a photo of a small {}",
]

STYLE_TEMPLATE = [
    "a painting in the style of {}",
    "a rendering in the style of {}",
    "a cropped painting in the style of {}",
    "the painting in the style of {}",
    "a clean painting in the style of {}",
    "a dirty painting in the style of {}",
    "a dark painting in the style of {}",
    "a picture in the style of {}",
    "a cool painting in the style of {}",
    "a close-up painting in the style of {}",
    "a bright painting in the style of {}",
    "a cropped painting in the style of {}",
    "a good painting in the style of {}",
    "a close-up painting in the style of {}",
    "a rendition in the style of {}",
    "a nice painting in the style of {}",
    "a small painting in the style of {}",
    "a weird painting in the style of {}",
    "a large painting in the style of {}",
]

NULL_TEMPLATE = ["{}"]

TEMPLATE_MAP = {
    "object": OBJECT_TEMPLATE,
    "style": STYLE_TEMPLATE,
    "null": NULL_TEMPLATE,
}


def _randomset(lis):
    ret = []
    for i in range(len(lis)):
        if random.random() < 0.5:
            ret.append(lis[i])
    return ret


def _shuffle(lis):

    return random.sample(lis, len(lis))


def _get_cutout_holes(
    height,
    width,
    min_holes=8,
    max_holes=32,
    min_height=16,
    max_height=128,
    min_width=16,
    max_width=128,
):
    holes = []
    for _n in range(random.randint(min_holes, max_holes)):
        hole_height = random.randint(min_height, max_height)
        hole_width = random.randint(min_width, max_width)
        y1 = random.randint(0, height - hole_height)
        x1 = random.randint(0, width - hole_width)
        y2 = y1 + hole_height
        x2 = x1 + hole_width
        holes.append((x1, y1, x2, y2))
    return holes


def _generate_random_mask(image):
    mask = zeros_like(image[:1])
    holes = _get_cutout_holes(mask.shape[1], mask.shape[2])
    for (x1, y1, x2, y2) in holes:
        mask[:, y1:y2, x1:x2] = 1.0
    if random.uniform(0, 1) < 0.25:
        mask.fill_(1.0)
    masked_image = image * (mask < 0.5)
    return mask, masked_image


class PivotalTuningDatasetCapation(Dataset):
    """
    A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
    It pre-processes the images and the tokenizes prompts.
    """

    def __init__(
        self,
        instance_data_root,
        tokenizer,
        token_map: Optional[dict] = None,
        use_template: Optional[str] = None,
        size=512,
        h_flip=True,
        color_jitter=False,
        resize=True,
        use_mask_captioned_data=False,
        use_face_segmentation_condition=False,
        train_inpainting=False,
        blur_amount: int = 70,
    ):
        self.size = size
        self.tokenizer = tokenizer
        self.resize = resize
        self.train_inpainting = train_inpainting

        instance_data_root = Path(instance_data_root)
        if not instance_data_root.exists():
            raise ValueError("Instance images root doesn't exists.")

        self.instance_images_path = []
        self.mask_path = []

        assert not (
            use_mask_captioned_data and use_template
        ), "Can't use both mask caption data and template."

        # Prepare the instance images
        if use_mask_captioned_data:
            src_imgs = glob.glob(str(instance_data_root) + "/*src.jpg")
            for f in src_imgs:
                idx = int(str(Path(f).stem).split(".")[0])
                mask_path = f"{instance_data_root}/{idx}.mask.png"

                if Path(mask_path).exists():
                    self.instance_images_path.append(f)
                    self.mask_path.append(mask_path)
                else:
                    print(f"Mask not found for {f}")

            self.captions = open(f"{instance_data_root}/caption.txt").readlines()

        else:
            possibily_src_images = (
                glob.glob(str(instance_data_root) + "/*.jpg")
                + glob.glob(str(instance_data_root) + "/*.png")
                + glob.glob(str(instance_data_root) + "/*.jpeg")
            )
            possibily_src_images = (
                set(possibily_src_images)
                - set(glob.glob(str(instance_data_root) + "/*mask.png"))
                - set([str(instance_data_root) + "/caption.txt"])
            )

            self.instance_images_path = list(set(possibily_src_images))
            self.captions = [
                x.split("/")[-1].split(".")[0] for x in self.instance_images_path
            ]

        assert (
            len(self.instance_images_path) > 0
        ), "No images found in the instance data root."

        self.instance_images_path = sorted(self.instance_images_path)

        self.use_mask = use_face_segmentation_condition or use_mask_captioned_data
        self.use_mask_captioned_data = use_mask_captioned_data

        if use_face_segmentation_condition:

            for idx in range(len(self.instance_images_path)):
                targ = f"{instance_data_root}/{idx}.mask.png"
                # see if the mask exists
                if not Path(targ).exists():
                    print(f"Mask not found for {targ}")

                    print(
                        "Warning : this will pre-process all the images in the instance data root."
                    )

                    if len(self.mask_path) > 0:
                        print(
                            "Warning : masks already exists, but will be overwritten."
                        )

                    masks = face_mask_google_mediapipe(
                        [
                            Image.open(f).convert("RGB")
                            for f in self.instance_images_path
                        ]
                    )
                    for idx, mask in enumerate(masks):
                        mask.save(f"{instance_data_root}/{idx}.mask.png")

                    break

            for idx in range(len(self.instance_images_path)):
                self.mask_path.append(f"{instance_data_root}/{idx}.mask.png")

        self.num_instance_images = len(self.instance_images_path)
        self.token_map = token_map

        self.use_template = use_template
        if use_template is not None:
            self.templates = TEMPLATE_MAP[use_template]

        self._length = self.num_instance_images

        self.h_flip = h_flip
        self.image_transforms = transforms.Compose(
            [
                transforms.Resize(
                    size, interpolation=transforms.InterpolationMode.BILINEAR
                )
                if resize
                else transforms.Lambda(lambda x: x),
                transforms.ColorJitter(0.1, 0.1)
                if color_jitter
                else transforms.Lambda(lambda x: x),
                transforms.CenterCrop(size),
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
            ]
        )

        self.blur_amount = blur_amount

    def __len__(self):
        return self._length

    def __getitem__(self, index):
        example = {}
        instance_image = Image.open(
            self.instance_images_path[index % self.num_instance_images]
        )
        if not instance_image.mode == "RGB":
            instance_image = instance_image.convert("RGB")
        example["instance_images"] = self.image_transforms(instance_image)

        if self.train_inpainting:
            (
                example["instance_masks"],
                example["instance_masked_images"],
            ) = _generate_random_mask(example["instance_images"])

        if self.use_template:
            assert self.token_map is not None
            input_tok = list(self.token_map.values())[0]

            text = random.choice(self.templates).format(input_tok)
        else:
            text = self.captions[index % self.num_instance_images].strip()

            if self.token_map is not None:
                for token, value in self.token_map.items():
                    text = text.replace(token, value)

        print(text)

        if self.use_mask:
            example["mask"] = (
                self.image_transforms(
                    Image.open(self.mask_path[index % self.num_instance_images])
                )
                * 0.5
                + 1.0
            )

        if self.h_flip and random.random() > 0.5:
            hflip = transforms.RandomHorizontalFlip(p=1)

            example["instance_images"] = hflip(example["instance_images"])
            if self.use_mask:
                example["mask"] = hflip(example["mask"])

        example["instance_prompt_ids"] = self.tokenizer(
            text,
            padding="do_not_pad",
            truncation=True,
            max_length=self.tokenizer.model_max_length,
        ).input_ids

        return example
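A minimal sketch of constructing this dataset for template-based captioning; the tokenizer checkpoint, data directory and placeholder token below are assumptions for illustration, not values fixed by this file:

```python
# Minimal sketch; checkpoint id, directory and token are placeholders.
from transformers import CLIPTokenizer
from lora_diffusion.dataset import PivotalTuningDatasetCapation

tokenizer = CLIPTokenizer.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="tokenizer"
)

train_dataset = PivotalTuningDatasetCapation(
    instance_data_root="./instance_images",
    tokenizer=tokenizer,
    token_map={"TOKEN": "<s1>"},  # occurrences of TOKEN become <s1> in the prompt
    use_template="object",        # one of "object", "style", "null"
    size=512,
)
example = train_dataset[0]  # dict with "instance_images" and "instance_prompt_ids"
```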
lora_diffusion/lora.py
ADDED
@@ -0,0 +1,1110 @@
1 |
+
import json
|
2 |
+
import math
|
3 |
+
from itertools import groupby
|
4 |
+
from typing import Callable, Dict, List, Optional, Set, Tuple, Type, Union
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import PIL
|
8 |
+
import torch
|
9 |
+
import torch.nn as nn
|
10 |
+
import torch.nn.functional as F
|
11 |
+
|
12 |
+
try:
|
13 |
+
from safetensors.torch import safe_open
|
14 |
+
from safetensors.torch import save_file as safe_save
|
15 |
+
|
16 |
+
safetensors_available = True
|
17 |
+
except ImportError:
|
18 |
+
from .safe_open import safe_open
|
19 |
+
|
20 |
+
def safe_save(
|
21 |
+
tensors: Dict[str, torch.Tensor],
|
22 |
+
filename: str,
|
23 |
+
metadata: Optional[Dict[str, str]] = None,
|
24 |
+
) -> None:
|
25 |
+
raise EnvironmentError(
|
26 |
+
"Saving safetensors requires the safetensors library. Please install with pip or similar."
|
27 |
+
)
|
28 |
+
|
29 |
+
safetensors_available = False
|
30 |
+
|
31 |
+
|
32 |
+
class LoraInjectedLinear(nn.Module):
|
33 |
+
def __init__(
|
34 |
+
self, in_features, out_features, bias=False, r=4, dropout_p=0.1, scale=1.0
|
35 |
+
):
|
36 |
+
super().__init__()
|
37 |
+
|
38 |
+
if r > min(in_features, out_features):
|
39 |
+
raise ValueError(
|
40 |
+
f"LoRA rank {r} must be less or equal than {min(in_features, out_features)}"
|
41 |
+
)
|
42 |
+
self.r = r
|
43 |
+
self.linear = nn.Linear(in_features, out_features, bias)
|
44 |
+
self.lora_down = nn.Linear(in_features, r, bias=False)
|
45 |
+
self.dropout = nn.Dropout(dropout_p)
|
46 |
+
self.lora_up = nn.Linear(r, out_features, bias=False)
|
47 |
+
self.scale = scale
|
48 |
+
self.selector = nn.Identity()
|
49 |
+
|
50 |
+
nn.init.normal_(self.lora_down.weight, std=1 / r)
|
51 |
+
nn.init.zeros_(self.lora_up.weight)
|
52 |
+
|
53 |
+
def forward(self, input):
|
54 |
+
return (
|
55 |
+
self.linear(input)
|
56 |
+
+ self.dropout(self.lora_up(self.selector(self.lora_down(input))))
|
57 |
+
* self.scale
|
58 |
+
)
|
59 |
+
|
60 |
+
def realize_as_lora(self):
|
61 |
+
return self.lora_up.weight.data * self.scale, self.lora_down.weight.data
|
62 |
+
|
63 |
+
def set_selector_from_diag(self, diag: torch.Tensor):
|
64 |
+
# diag is a 1D tensor of size (r,)
|
65 |
+
assert diag.shape == (self.r,)
|
66 |
+
self.selector = nn.Linear(self.r, self.r, bias=False)
|
67 |
+
self.selector.weight.data = torch.diag(diag)
|
68 |
+
self.selector.weight.data = self.selector.weight.data.to(
|
69 |
+
self.lora_up.weight.device
|
70 |
+
).to(self.lora_up.weight.dtype)
|
71 |
+
|
72 |
+
|
73 |
+
class LoraInjectedConv2d(nn.Module):
|
74 |
+
def __init__(
|
75 |
+
self,
|
76 |
+
in_channels: int,
|
77 |
+
out_channels: int,
|
78 |
+
kernel_size,
|
79 |
+
stride=1,
|
80 |
+
padding=0,
|
81 |
+
dilation=1,
|
82 |
+
groups: int = 1,
|
83 |
+
bias: bool = True,
|
84 |
+
r: int = 4,
|
85 |
+
dropout_p: float = 0.1,
|
86 |
+
scale: float = 1.0,
|
87 |
+
):
|
88 |
+
super().__init__()
|
89 |
+
if r > min(in_channels, out_channels):
|
90 |
+
raise ValueError(
|
91 |
+
f"LoRA rank {r} must be less or equal than {min(in_channels, out_channels)}"
|
92 |
+
)
|
93 |
+
self.r = r
|
94 |
+
self.conv = nn.Conv2d(
|
95 |
+
in_channels=in_channels,
|
96 |
+
out_channels=out_channels,
|
97 |
+
kernel_size=kernel_size,
|
98 |
+
stride=stride,
|
99 |
+
padding=padding,
|
100 |
+
dilation=dilation,
|
101 |
+
groups=groups,
|
102 |
+
bias=bias,
|
103 |
+
)
|
104 |
+
|
105 |
+
self.lora_down = nn.Conv2d(
|
106 |
+
in_channels=in_channels,
|
107 |
+
out_channels=r,
|
108 |
+
kernel_size=kernel_size,
|
109 |
+
stride=stride,
|
110 |
+
padding=padding,
|
111 |
+
dilation=dilation,
|
112 |
+
groups=groups,
|
113 |
+
bias=False,
|
114 |
+
)
|
115 |
+
self.dropout = nn.Dropout(dropout_p)
|
116 |
+
self.lora_up = nn.Conv2d(
|
117 |
+
in_channels=r,
|
118 |
+
out_channels=out_channels,
|
119 |
+
kernel_size=1,
|
120 |
+
stride=1,
|
121 |
+
padding=0,
|
122 |
+
bias=False,
|
123 |
+
)
|
124 |
+
self.selector = nn.Identity()
|
125 |
+
self.scale = scale
|
126 |
+
|
127 |
+
nn.init.normal_(self.lora_down.weight, std=1 / r)
|
128 |
+
nn.init.zeros_(self.lora_up.weight)
|
129 |
+
|
130 |
+
def forward(self, input):
|
131 |
+
return (
|
132 |
+
self.conv(input)
|
133 |
+
+ self.dropout(self.lora_up(self.selector(self.lora_down(input))))
|
134 |
+
* self.scale
|
135 |
+
)
|
136 |
+
|
137 |
+
def realize_as_lora(self):
|
138 |
+
return self.lora_up.weight.data * self.scale, self.lora_down.weight.data
|
139 |
+
|
140 |
+
def set_selector_from_diag(self, diag: torch.Tensor):
|
141 |
+
# diag is a 1D tensor of size (r,)
|
142 |
+
assert diag.shape == (self.r,)
|
143 |
+
self.selector = nn.Conv2d(
|
144 |
+
in_channels=self.r,
|
145 |
+
out_channels=self.r,
|
146 |
+
kernel_size=1,
|
147 |
+
stride=1,
|
148 |
+
padding=0,
|
149 |
+
bias=False,
|
150 |
+
)
|
151 |
+
self.selector.weight.data = torch.diag(diag)
|
152 |
+
|
153 |
+
# same device + dtype as lora_up
|
154 |
+
self.selector.weight.data = self.selector.weight.data.to(
|
155 |
+
self.lora_up.weight.device
|
156 |
+
).to(self.lora_up.weight.dtype)
|
157 |
+
|
158 |
+
|
159 |
+
UNET_DEFAULT_TARGET_REPLACE = {"CrossAttention", "Attention", "GEGLU"}
|
160 |
+
|
161 |
+
UNET_EXTENDED_TARGET_REPLACE = {"ResnetBlock2D", "CrossAttention", "Attention", "GEGLU"}
|
162 |
+
|
163 |
+
TEXT_ENCODER_DEFAULT_TARGET_REPLACE = {"CLIPAttention"}
|
164 |
+
|
165 |
+
TEXT_ENCODER_EXTENDED_TARGET_REPLACE = {"CLIPAttention"}
|
166 |
+
|
167 |
+
DEFAULT_TARGET_REPLACE = UNET_DEFAULT_TARGET_REPLACE
|
168 |
+
|
169 |
+
EMBED_FLAG = "<embed>"
|
170 |
+
|
171 |
+
|
172 |
+
def _find_children(
|
173 |
+
model,
|
174 |
+
search_class: List[Type[nn.Module]] = [nn.Linear],
|
175 |
+
):
|
176 |
+
"""
|
177 |
+
Find all modules of a certain class (or union of classes).
|
178 |
+
|
179 |
+
Returns all matching modules, along with the parent of those moduless and the
|
180 |
+
names they are referenced by.
|
181 |
+
"""
|
182 |
+
# For each target find every linear_class module that isn't a child of a LoraInjectedLinear
|
183 |
+
for parent in model.modules():
|
184 |
+
for name, module in parent.named_children():
|
185 |
+
if any([isinstance(module, _class) for _class in search_class]):
|
186 |
+
yield parent, name, module
|
187 |
+
|
188 |
+
|
189 |
+
def _find_modules_v2(
|
190 |
+
model,
|
191 |
+
ancestor_class: Optional[Set[str]] = None,
|
192 |
+
search_class: List[Type[nn.Module]] = [nn.Linear],
|
193 |
+
exclude_children_of: Optional[List[Type[nn.Module]]] = [
|
194 |
+
LoraInjectedLinear,
|
195 |
+
LoraInjectedConv2d,
|
196 |
+
],
|
197 |
+
):
|
198 |
+
"""
|
199 |
+
Find all modules of a certain class (or union of classes) that are direct or
|
200 |
+
indirect descendants of other modules of a certain class (or union of classes).
|
201 |
+
|
202 |
+
Returns all matching modules, along with the parent of those moduless and the
|
203 |
+
names they are referenced by.
|
204 |
+
"""
|
205 |
+
|
206 |
+
# Get the targets we should replace all linears under
|
207 |
+
if ancestor_class is not None:
|
208 |
+
ancestors = (
|
209 |
+
module
|
210 |
+
for module in model.modules()
|
211 |
+
if module.__class__.__name__ in ancestor_class
|
212 |
+
)
|
213 |
+
else:
|
214 |
+
# this, incase you want to naively iterate over all modules.
|
215 |
+
ancestors = [module for module in model.modules()]
|
216 |
+
|
217 |
+
# For each target find every linear_class module that isn't a child of a LoraInjectedLinear
|
218 |
+
for ancestor in ancestors:
|
219 |
+
for fullname, module in ancestor.named_modules():
|
220 |
+
if any([isinstance(module, _class) for _class in search_class]):
|
221 |
+
# Find the direct parent if this is a descendant, not a child, of target
|
222 |
+
*path, name = fullname.split(".")
|
223 |
+
parent = ancestor
|
224 |
+
while path:
|
225 |
+
parent = parent.get_submodule(path.pop(0))
|
226 |
+
# Skip this linear if it's a child of a LoraInjectedLinear
|
227 |
+
if exclude_children_of and any(
|
228 |
+
[isinstance(parent, _class) for _class in exclude_children_of]
|
229 |
+
):
|
230 |
+
continue
|
231 |
+
# Otherwise, yield it
|
232 |
+
yield parent, name, module
|
233 |
+
|
234 |
+
|
235 |
+
def _find_modules_old(
|
236 |
+
model,
|
237 |
+
ancestor_class: Set[str] = DEFAULT_TARGET_REPLACE,
|
238 |
+
search_class: List[Type[nn.Module]] = [nn.Linear],
|
239 |
+
exclude_children_of: Optional[List[Type[nn.Module]]] = [LoraInjectedLinear],
|
240 |
+
):
|
241 |
+
ret = []
|
242 |
+
for _module in model.modules():
|
243 |
+
if _module.__class__.__name__ in ancestor_class:
|
244 |
+
|
245 |
+
for name, _child_module in _module.named_modules():
|
246 |
+
if _child_module.__class__ in search_class:
|
247 |
+
ret.append((_module, name, _child_module))
|
248 |
+
print(ret)
|
249 |
+
return ret
|
250 |
+
|
251 |
+
|
252 |
+
_find_modules = _find_modules_v2
|
253 |
+
|
254 |
+
|
255 |
+
def inject_trainable_lora(
|
256 |
+
model: nn.Module,
|
257 |
+
target_replace_module: Set[str] = DEFAULT_TARGET_REPLACE,
|
258 |
+
r: int = 4,
|
259 |
+
loras=None, # path to lora .pt
|
260 |
+
verbose: bool = False,
|
261 |
+
dropout_p: float = 0.0,
|
262 |
+
scale: float = 1.0,
|
263 |
+
):
|
264 |
+
"""
|
265 |
+
inject lora into model, and returns lora parameter groups.
|
266 |
+
"""
|
267 |
+
|
268 |
+
require_grad_params = []
|
269 |
+
names = []
|
270 |
+
|
271 |
+
if loras != None:
|
272 |
+
loras = torch.load(loras)
|
273 |
+
|
274 |
+
for _module, name, _child_module in _find_modules(
|
275 |
+
model, target_replace_module, search_class=[nn.Linear]
|
276 |
+
):
|
277 |
+
weight = _child_module.weight
|
278 |
+
bias = _child_module.bias
|
279 |
+
if verbose:
|
280 |
+
print("LoRA Injection : injecting lora into ", name)
|
281 |
+
print("LoRA Injection : weight shape", weight.shape)
|
282 |
+
_tmp = LoraInjectedLinear(
|
283 |
+
_child_module.in_features,
|
284 |
+
_child_module.out_features,
|
285 |
+
_child_module.bias is not None,
|
286 |
+
r=r,
|
287 |
+
dropout_p=dropout_p,
|
288 |
+
scale=scale,
|
289 |
+
)
|
290 |
+
_tmp.linear.weight = weight
|
291 |
+
if bias is not None:
|
292 |
+
_tmp.linear.bias = bias
|
293 |
+
|
294 |
+
# switch the module
|
295 |
+
_tmp.to(_child_module.weight.device).to(_child_module.weight.dtype)
|
296 |
+
_module._modules[name] = _tmp
|
297 |
+
|
298 |
+
require_grad_params.append(_module._modules[name].lora_up.parameters())
|
299 |
+
require_grad_params.append(_module._modules[name].lora_down.parameters())
|
300 |
+
|
301 |
+
if loras != None:
|
302 |
+
_module._modules[name].lora_up.weight = loras.pop(0)
|
303 |
+
_module._modules[name].lora_down.weight = loras.pop(0)
|
304 |
+
|
305 |
+
_module._modules[name].lora_up.weight.requires_grad = True
|
306 |
+
_module._modules[name].lora_down.weight.requires_grad = True
|
307 |
+
names.append(name)
|
308 |
+
|
309 |
+
return require_grad_params, names
|
310 |
+
|
311 |
+
|
312 |
+
def inject_trainable_lora_extended(
|
313 |
+
model: nn.Module,
|
314 |
+
target_replace_module: Set[str] = UNET_EXTENDED_TARGET_REPLACE,
|
315 |
+
r: int = 4,
|
316 |
+
loras=None, # path to lora .pt
|
317 |
+
):
|
318 |
+
"""
|
319 |
+
inject lora into model, and returns lora parameter groups.
|
320 |
+
"""
|
321 |
+
|
322 |
+
require_grad_params = []
|
323 |
+
names = []
|
324 |
+
|
325 |
+
if loras != None:
|
326 |
+
loras = torch.load(loras)
|
327 |
+
|
328 |
+
for _module, name, _child_module in _find_modules(
|
329 |
+
model, target_replace_module, search_class=[nn.Linear, nn.Conv2d]
|
330 |
+
):
|
331 |
+
if _child_module.__class__ == nn.Linear:
|
332 |
+
weight = _child_module.weight
|
333 |
+
bias = _child_module.bias
|
334 |
+
_tmp = LoraInjectedLinear(
|
335 |
+
_child_module.in_features,
|
336 |
+
_child_module.out_features,
|
337 |
+
_child_module.bias is not None,
|
338 |
+
r=r,
|
339 |
+
)
|
340 |
+
_tmp.linear.weight = weight
|
341 |
+
if bias is not None:
|
342 |
+
_tmp.linear.bias = bias
|
343 |
+
elif _child_module.__class__ == nn.Conv2d:
|
344 |
+
weight = _child_module.weight
|
345 |
+
bias = _child_module.bias
|
346 |
+
_tmp = LoraInjectedConv2d(
|
347 |
+
_child_module.in_channels,
|
348 |
+
_child_module.out_channels,
|
349 |
+
_child_module.kernel_size,
|
350 |
+
_child_module.stride,
|
351 |
+
_child_module.padding,
|
352 |
+
_child_module.dilation,
|
353 |
+
_child_module.groups,
|
354 |
+
_child_module.bias is not None,
|
355 |
+
r=r,
|
356 |
+
)
|
357 |
+
|
358 |
+
_tmp.conv.weight = weight
|
359 |
+
if bias is not None:
|
360 |
+
_tmp.conv.bias = bias
|
361 |
+
|
362 |
+
# switch the module
|
363 |
+
_tmp.to(_child_module.weight.device).to(_child_module.weight.dtype)
|
364 |
+
if bias is not None:
|
365 |
+
_tmp.to(_child_module.bias.device).to(_child_module.bias.dtype)
|
366 |
+
|
367 |
+
_module._modules[name] = _tmp
|
368 |
+
|
369 |
+
require_grad_params.append(_module._modules[name].lora_up.parameters())
|
370 |
+
require_grad_params.append(_module._modules[name].lora_down.parameters())
|
371 |
+
|
372 |
+
if loras != None:
|
373 |
+
_module._modules[name].lora_up.weight = loras.pop(0)
|
374 |
+
_module._modules[name].lora_down.weight = loras.pop(0)
|
375 |
+
|
376 |
+
_module._modules[name].lora_up.weight.requires_grad = True
|
377 |
+
_module._modules[name].lora_down.weight.requires_grad = True
|
378 |
+
names.append(name)
|
379 |
+
|
380 |
+
return require_grad_params, names
|
381 |
+
|
382 |
+
|
383 |
+
def extract_lora_ups_down(model, target_replace_module=DEFAULT_TARGET_REPLACE):
|
384 |
+
|
385 |
+
loras = []
|
386 |
+
|
387 |
+
for _m, _n, _child_module in _find_modules(
|
388 |
+
model,
|
389 |
+
target_replace_module,
|
390 |
+
search_class=[LoraInjectedLinear, LoraInjectedConv2d],
|
391 |
+
):
|
392 |
+
loras.append((_child_module.lora_up, _child_module.lora_down))
|
393 |
+
|
394 |
+
if len(loras) == 0:
|
395 |
+
raise ValueError("No lora injected.")
|
396 |
+
|
397 |
+
return loras
|
398 |
+
|
399 |
+
|
400 |
+
def extract_lora_as_tensor(
|
401 |
+
model, target_replace_module=DEFAULT_TARGET_REPLACE, as_fp16=True
|
402 |
+
):
|
403 |
+
|
404 |
+
loras = []
|
405 |
+
|
406 |
+
for _m, _n, _child_module in _find_modules(
|
407 |
+
model,
|
408 |
+
target_replace_module,
|
409 |
+
search_class=[LoraInjectedLinear, LoraInjectedConv2d],
|
410 |
+
):
|
411 |
+
up, down = _child_module.realize_as_lora()
|
412 |
+
if as_fp16:
|
413 |
+
up = up.to(torch.float16)
|
414 |
+
down = down.to(torch.float16)
|
415 |
+
|
416 |
+
loras.append((up, down))
|
417 |
+
|
418 |
+
if len(loras) == 0:
|
419 |
+
raise ValueError("No lora injected.")
|
420 |
+
|
421 |
+
return loras
|
422 |
+
|
423 |
+
|
424 |
+
def save_lora_weight(
|
425 |
+
model,
|
426 |
+
path="./lora.pt",
|
427 |
+
target_replace_module=DEFAULT_TARGET_REPLACE,
|
428 |
+
):
|
429 |
+
weights = []
|
430 |
+
for _up, _down in extract_lora_ups_down(
|
431 |
+
model, target_replace_module=target_replace_module
|
432 |
+
):
|
433 |
+
weights.append(_up.weight.to("cpu").to(torch.float16))
|
434 |
+
weights.append(_down.weight.to("cpu").to(torch.float16))
|
435 |
+
|
436 |
+
torch.save(weights, path)
|
437 |
+
|
438 |
+
|
439 |
+
def save_lora_as_json(model, path="./lora.json"):
|
440 |
+
weights = []
|
441 |
+
for _up, _down in extract_lora_ups_down(model):
|
442 |
+
weights.append(_up.weight.detach().cpu().numpy().tolist())
|
443 |
+
weights.append(_down.weight.detach().cpu().numpy().tolist())
|
444 |
+
|
445 |
+
import json
|
446 |
+
|
447 |
+
with open(path, "w") as f:
|
448 |
+
json.dump(weights, f)
|
449 |
+
|
450 |
+
|
451 |
+
def save_safeloras_with_embeds(
|
452 |
+
modelmap: Dict[str, Tuple[nn.Module, Set[str]]] = {},
|
453 |
+
embeds: Dict[str, torch.Tensor] = {},
|
454 |
+
outpath="./lora.safetensors",
|
455 |
+
):
|
456 |
+
"""
|
457 |
+
Saves the Lora from multiple modules in a single safetensor file.
|
458 |
+
|
459 |
+
modelmap is a dictionary of {
|
460 |
+
"module name": (module, target_replace_module)
|
461 |
+
}
|
462 |
+
"""
|
463 |
+
weights = {}
|
464 |
+
metadata = {}
|
465 |
+
|
466 |
+
for name, (model, target_replace_module) in modelmap.items():
|
467 |
+
metadata[name] = json.dumps(list(target_replace_module))
|
468 |
+
|
469 |
+
for i, (_up, _down) in enumerate(
|
470 |
+
extract_lora_as_tensor(model, target_replace_module)
|
471 |
+
):
|
472 |
+
rank = _down.shape[0]
|
473 |
+
|
474 |
+
metadata[f"{name}:{i}:rank"] = str(rank)
|
475 |
+
weights[f"{name}:{i}:up"] = _up
|
476 |
+
weights[f"{name}:{i}:down"] = _down
|
477 |
+
|
478 |
+
for token, tensor in embeds.items():
|
479 |
+
metadata[token] = EMBED_FLAG
|
480 |
+
weights[token] = tensor
|
481 |
+
|
482 |
+
print(f"Saving weights to {outpath}")
|
483 |
+
safe_save(weights, outpath, metadata)
|
484 |
+
|
485 |
+
|
486 |
+
def save_safeloras(
|
487 |
+
modelmap: Dict[str, Tuple[nn.Module, Set[str]]] = {},
|
488 |
+
outpath="./lora.safetensors",
|
489 |
+
):
|
490 |
+
return save_safeloras_with_embeds(modelmap=modelmap, outpath=outpath)
|
491 |
+
|
492 |
+
|
493 |
+
def convert_loras_to_safeloras_with_embeds(
|
494 |
+
modelmap: Dict[str, Tuple[str, Set[str], int]] = {},
|
495 |
+
embeds: Dict[str, torch.Tensor] = {},
|
496 |
+
outpath="./lora.safetensors",
|
497 |
+
):
|
498 |
+
"""
|
499 |
+
Converts the Lora from multiple pytorch .pt files into a single safetensor file.
|
500 |
+
|
501 |
+
modelmap is a dictionary of {
|
502 |
+
"module name": (pytorch_model_path, target_replace_module, rank)
|
503 |
+
}
|
504 |
+
"""
|
505 |
+
|
506 |
+
weights = {}
|
507 |
+
metadata = {}
|
508 |
+
|
509 |
+
for name, (path, target_replace_module, r) in modelmap.items():
|
510 |
+
metadata[name] = json.dumps(list(target_replace_module))
|
511 |
+
|
512 |
+
lora = torch.load(path)
|
513 |
+
for i, weight in enumerate(lora):
|
514 |
+
is_up = i % 2 == 0
|
515 |
+
i = i // 2
|
516 |
+
|
517 |
+
if is_up:
|
518 |
+
metadata[f"{name}:{i}:rank"] = str(r)
|
519 |
+
weights[f"{name}:{i}:up"] = weight
|
520 |
+
else:
|
521 |
+
weights[f"{name}:{i}:down"] = weight
|
522 |
+
|
523 |
+
for token, tensor in embeds.items():
|
524 |
+
metadata[token] = EMBED_FLAG
|
525 |
+
weights[token] = tensor
|
526 |
+
|
527 |
+
print(f"Saving weights to {outpath}")
|
528 |
+
safe_save(weights, outpath, metadata)
|
529 |
+
|
530 |
+
|
531 |
+
def convert_loras_to_safeloras(
|
532 |
+
modelmap: Dict[str, Tuple[str, Set[str], int]] = {},
|
533 |
+
outpath="./lora.safetensors",
|
534 |
+
):
|
535 |
+
convert_loras_to_safeloras_with_embeds(modelmap=modelmap, outpath=outpath)
|
536 |
+
|
537 |
+
|
538 |
+
def parse_safeloras(
|
539 |
+
safeloras,
|
540 |
+
) -> Dict[str, Tuple[List[nn.parameter.Parameter], List[int], List[str]]]:
|
541 |
+
"""
|
542 |
+
Converts a loaded safetensor file that contains a set of module Loras
|
543 |
+
into Parameters and other information
|
544 |
+
|
545 |
+
Output is a dictionary of {
|
546 |
+
"module name": (
|
547 |
+
[list of weights],
|
548 |
+
[list of ranks],
|
549 |
+
target_replacement_modules
|
550 |
+
)
|
551 |
+
}
|
552 |
+
"""
|
553 |
+
loras = {}
|
554 |
+
metadata = safeloras.metadata()
|
555 |
+
|
556 |
+
get_name = lambda k: k.split(":")[0]
|
557 |
+
|
558 |
+
keys = list(safeloras.keys())
|
559 |
+
keys.sort(key=get_name)
|
560 |
+
|
561 |
+
for name, module_keys in groupby(keys, get_name):
|
562 |
+
info = metadata.get(name)
|
563 |
+
|
564 |
+
if not info:
|
565 |
+
raise ValueError(
|
566 |
+
f"Tensor {name} has no metadata - is this a Lora safetensor?"
|
567 |
+
)
|
568 |
+
|
569 |
+
# Skip Textual Inversion embeds
|
570 |
+
if info == EMBED_FLAG:
|
571 |
+
continue
|
572 |
+
|
573 |
+
# Handle Loras
|
574 |
+
# Extract the targets
|
575 |
+
target = json.loads(info)
|
576 |
+
|
577 |
+
# Build the result lists - Python needs us to preallocate lists to insert into them
|
578 |
+
module_keys = list(module_keys)
|
579 |
+
ranks = [4] * (len(module_keys) // 2)
|
580 |
+
weights = [None] * len(module_keys)
|
581 |
+
|
582 |
+
for key in module_keys:
|
583 |
+
# Split the model name and index out of the key
|
584 |
+
_, idx, direction = key.split(":")
|
585 |
+
idx = int(idx)
|
586 |
+
|
587 |
+
# Add the rank
|
588 |
+
ranks[idx] = int(metadata[f"{name}:{idx}:rank"])
|
589 |
+
|
590 |
+
# Insert the weight into the list
|
591 |
+
idx = idx * 2 + (1 if direction == "down" else 0)
|
592 |
+
weights[idx] = nn.parameter.Parameter(safeloras.get_tensor(key))
|
593 |
+
|
594 |
+
loras[name] = (weights, ranks, target)
|
595 |
+
|
596 |
+
return loras
|
597 |
+
|
598 |
+
|
599 |
+
def parse_safeloras_embeds(
|
600 |
+
safeloras,
|
601 |
+
) -> Dict[str, torch.Tensor]:
|
602 |
+
"""
|
603 |
+
Converts a loaded safetensor file that contains Textual Inversion embeds into
|
604 |
+
a dictionary of embed_token: Tensor
|
605 |
+
"""
|
606 |
+
embeds = {}
|
607 |
+
metadata = safeloras.metadata()
|
608 |
+
|
609 |
+
for key in safeloras.keys():
|
610 |
+
# Only handle Textual Inversion embeds
|
611 |
+
meta = metadata.get(key)
|
612 |
+
if not meta or meta != EMBED_FLAG:
|
613 |
+
continue
|
614 |
+
|
615 |
+
embeds[key] = safeloras.get_tensor(key)
|
616 |
+
|
617 |
+
return embeds
|
618 |
+
|
619 |
+
|
620 |
+
def load_safeloras(path, device="cpu"):
|
621 |
+
safeloras = safe_open(path, framework="pt", device=device)
|
622 |
+
return parse_safeloras(safeloras)
|
623 |
+
|
624 |
+
|
625 |
+
def load_safeloras_embeds(path, device="cpu"):
|
626 |
+
safeloras = safe_open(path, framework="pt", device=device)
|
627 |
+
return parse_safeloras_embeds(safeloras)
|
628 |
+
|
629 |
+
|
630 |
+
def load_safeloras_both(path, device="cpu"):
|
631 |
+
safeloras = safe_open(path, framework="pt", device=device)
|
632 |
+
return parse_safeloras(safeloras), parse_safeloras_embeds(safeloras)
|
633 |
+
|
634 |
+
|
635 |
+
def collapse_lora(model, alpha=1.0):
|
636 |
+
|
637 |
+
for _module, name, _child_module in _find_modules(
|
638 |
+
model,
|
639 |
+
UNET_EXTENDED_TARGET_REPLACE | TEXT_ENCODER_EXTENDED_TARGET_REPLACE,
|
640 |
+
search_class=[LoraInjectedLinear, LoraInjectedConv2d],
|
641 |
+
):
|
642 |
+
|
643 |
+
if isinstance(_child_module, LoraInjectedLinear):
|
644 |
+
print("Collapsing Lin Lora in", name)
|
645 |
+
|
646 |
+
_child_module.linear.weight = nn.Parameter(
|
647 |
+
_child_module.linear.weight.data
|
648 |
+
+ alpha
|
649 |
+
* (
|
650 |
+
_child_module.lora_up.weight.data
|
651 |
+
@ _child_module.lora_down.weight.data
|
652 |
+
)
|
653 |
+
.type(_child_module.linear.weight.dtype)
|
654 |
+
.to(_child_module.linear.weight.device)
|
655 |
+
)
|
656 |
+
|
657 |
+
else:
|
658 |
+
print("Collapsing Conv Lora in", name)
|
659 |
+
_child_module.conv.weight = nn.Parameter(
|
660 |
+
_child_module.conv.weight.data
|
661 |
+
+ alpha
|
662 |
+
* (
|
663 |
+
_child_module.lora_up.weight.data.flatten(start_dim=1)
|
664 |
+
@ _child_module.lora_down.weight.data.flatten(start_dim=1)
|
665 |
+
)
|
666 |
+
.reshape(_child_module.conv.weight.data.shape)
|
667 |
+
.type(_child_module.conv.weight.dtype)
|
668 |
+
.to(_child_module.conv.weight.device)
|
669 |
+
)
|
670 |
+
|
671 |
+
|
672 |
+
def monkeypatch_or_replace_lora(
|
673 |
+
model,
|
674 |
+
loras,
|
675 |
+
target_replace_module=DEFAULT_TARGET_REPLACE,
|
676 |
+
r: Union[int, List[int]] = 4,
|
677 |
+
):
|
678 |
+
for _module, name, _child_module in _find_modules(
|
679 |
+
model, target_replace_module, search_class=[nn.Linear, LoraInjectedLinear]
|
680 |
+
):
|
681 |
+
_source = (
|
682 |
+
_child_module.linear
|
683 |
+
if isinstance(_child_module, LoraInjectedLinear)
|
684 |
+
else _child_module
|
685 |
+
)
|
686 |
+
|
687 |
+
weight = _source.weight
|
688 |
+
bias = _source.bias
|
689 |
+
_tmp = LoraInjectedLinear(
|
690 |
+
_source.in_features,
|
691 |
+
_source.out_features,
|
692 |
+
_source.bias is not None,
|
693 |
+
r=r.pop(0) if isinstance(r, list) else r,
|
694 |
+
)
|
695 |
+
_tmp.linear.weight = weight
|
696 |
+
|
697 |
+
if bias is not None:
|
698 |
+
_tmp.linear.bias = bias
|
699 |
+
|
700 |
+
# switch the module
|
701 |
+
_module._modules[name] = _tmp
|
702 |
+
|
703 |
+
up_weight = loras.pop(0)
|
704 |
+
down_weight = loras.pop(0)
|
705 |
+
|
706 |
+
_module._modules[name].lora_up.weight = nn.Parameter(
|
707 |
+
up_weight.type(weight.dtype)
|
708 |
+
)
|
709 |
+
_module._modules[name].lora_down.weight = nn.Parameter(
|
710 |
+
down_weight.type(weight.dtype)
|
711 |
+
)
|
712 |
+
|
713 |
+
_module._modules[name].to(weight.device)
|
714 |
+
|
715 |
+
|
716 |
+
def monkeypatch_or_replace_lora_extended(
|
717 |
+
model,
|
718 |
+
loras,
|
719 |
+
target_replace_module=DEFAULT_TARGET_REPLACE,
|
720 |
+
r: Union[int, List[int]] = 4,
|
721 |
+
):
|
722 |
+
for _module, name, _child_module in _find_modules(
|
723 |
+
model,
|
724 |
+
target_replace_module,
|
725 |
+
search_class=[nn.Linear, LoraInjectedLinear, nn.Conv2d, LoraInjectedConv2d],
|
726 |
+
):
|
727 |
+
|
728 |
+
if (_child_module.__class__ == nn.Linear) or (
|
729 |
+
_child_module.__class__ == LoraInjectedLinear
|
730 |
+
):
|
731 |
+
if len(loras[0].shape) != 2:
|
732 |
+
continue
|
733 |
+
|
734 |
+
_source = (
|
735 |
+
_child_module.linear
|
736 |
+
if isinstance(_child_module, LoraInjectedLinear)
|
737 |
+
else _child_module
|
738 |
+
)
|
739 |
+
|
740 |
+
weight = _source.weight
|
741 |
+
bias = _source.bias
|
742 |
+
_tmp = LoraInjectedLinear(
|
743 |
+
_source.in_features,
|
744 |
+
_source.out_features,
|
745 |
+
_source.bias is not None,
|
746 |
+
r=r.pop(0) if isinstance(r, list) else r,
|
747 |
+
)
|
748 |
+
_tmp.linear.weight = weight
|
749 |
+
|
750 |
+
if bias is not None:
|
751 |
+
_tmp.linear.bias = bias
|
752 |
+
|
753 |
+
elif (_child_module.__class__ == nn.Conv2d) or (
|
754 |
+
_child_module.__class__ == LoraInjectedConv2d
|
755 |
+
):
|
756 |
+
if len(loras[0].shape) != 4:
|
757 |
+
continue
|
758 |
+
_source = (
|
759 |
+
_child_module.conv
|
760 |
+
if isinstance(_child_module, LoraInjectedConv2d)
|
761 |
+
else _child_module
|
762 |
+
)
|
763 |
+
|
764 |
+
weight = _source.weight
|
765 |
+
bias = _source.bias
|
766 |
+
_tmp = LoraInjectedConv2d(
|
767 |
+
_source.in_channels,
|
768 |
+
_source.out_channels,
|
769 |
+
_source.kernel_size,
|
770 |
+
_source.stride,
|
771 |
+
_source.padding,
|
772 |
+
_source.dilation,
|
773 |
+
_source.groups,
|
774 |
+
_source.bias is not None,
|
775 |
+
r=r.pop(0) if isinstance(r, list) else r,
|
776 |
+
)
|
777 |
+
|
778 |
+
_tmp.conv.weight = weight
|
779 |
+
|
780 |
+
if bias is not None:
|
781 |
+
_tmp.conv.bias = bias
|
782 |
+
|
783 |
+
# switch the module
|
784 |
+
_module._modules[name] = _tmp
|
785 |
+
|
786 |
+
up_weight = loras.pop(0)
|
787 |
+
down_weight = loras.pop(0)
|
788 |
+
|
789 |
+
_module._modules[name].lora_up.weight = nn.Parameter(
|
790 |
+
up_weight.type(weight.dtype)
|
791 |
+
)
|
792 |
+
_module._modules[name].lora_down.weight = nn.Parameter(
|
793 |
+
down_weight.type(weight.dtype)
|
794 |
+
)
|
795 |
+
|
796 |
+
_module._modules[name].to(weight.device)
|
797 |
+
|
798 |
+
|
799 |
+
def monkeypatch_or_replace_safeloras(models, safeloras):
|
800 |
+
loras = parse_safeloras(safeloras)
|
801 |
+
|
802 |
+
for name, (lora, ranks, target) in loras.items():
|
803 |
+
model = getattr(models, name, None)
|
804 |
+
|
805 |
+
if not model:
|
806 |
+
print(f"No model provided for {name}, contained in Lora")
|
807 |
+
continue
|
808 |
+
|
809 |
+
monkeypatch_or_replace_lora_extended(model, lora, target, ranks)
|
810 |
+
|
811 |
+
|
812 |
+
def monkeypatch_remove_lora(model):
|
813 |
+
for _module, name, _child_module in _find_modules(
|
814 |
+
model, search_class=[LoraInjectedLinear, LoraInjectedConv2d]
|
815 |
+
):
|
816 |
+
if isinstance(_child_module, LoraInjectedLinear):
|
817 |
+
_source = _child_module.linear
|
818 |
+
weight, bias = _source.weight, _source.bias
|
819 |
+
|
820 |
+
_tmp = nn.Linear(
|
821 |
+
_source.in_features, _source.out_features, bias is not None
|
822 |
+
)
|
823 |
+
|
824 |
+
_tmp.weight = weight
|
825 |
+
if bias is not None:
|
826 |
+
_tmp.bias = bias
|
827 |
+
|
828 |
+
else:
|
829 |
+
_source = _child_module.conv
|
830 |
+
weight, bias = _source.weight, _source.bias
|
831 |
+
|
832 |
+
_tmp = nn.Conv2d(
|
833 |
+
in_channels=_source.in_channels,
|
834 |
+
out_channels=_source.out_channels,
|
835 |
+
kernel_size=_source.kernel_size,
|
836 |
+
stride=_source.stride,
|
837 |
+
padding=_source.padding,
|
838 |
+
dilation=_source.dilation,
|
839 |
+
groups=_source.groups,
|
840 |
+
bias=bias is not None,
|
841 |
+
)
|
842 |
+
|
843 |
+
_tmp.weight = weight
|
844 |
+
if bias is not None:
|
845 |
+
_tmp.bias = bias
|
846 |
+
|
847 |
+
_module._modules[name] = _tmp
|
848 |
+
|
849 |
+
|
850 |
+
def monkeypatch_add_lora(
|
851 |
+
model,
|
852 |
+
loras,
|
853 |
+
target_replace_module=DEFAULT_TARGET_REPLACE,
|
854 |
+
alpha: float = 1.0,
|
855 |
+
beta: float = 1.0,
|
856 |
+
):
|
857 |
+
for _module, name, _child_module in _find_modules(
|
858 |
+
model, target_replace_module, search_class=[LoraInjectedLinear]
|
859 |
+
):
|
860 |
+
weight = _child_module.linear.weight
|
861 |
+
|
862 |
+
up_weight = loras.pop(0)
|
863 |
+
down_weight = loras.pop(0)
|
864 |
+
|
865 |
+
_module._modules[name].lora_up.weight = nn.Parameter(
|
866 |
+
up_weight.type(weight.dtype).to(weight.device) * alpha
|
867 |
+
+ _module._modules[name].lora_up.weight.to(weight.device) * beta
|
868 |
+
)
|
869 |
+
_module._modules[name].lora_down.weight = nn.Parameter(
|
870 |
+
down_weight.type(weight.dtype).to(weight.device) * alpha
|
871 |
+
+ _module._modules[name].lora_down.weight.to(weight.device) * beta
|
872 |
+
)
|
873 |
+
|
874 |
+
_module._modules[name].to(weight.device)
|
875 |
+
|
876 |
+
|
877 |
+
def tune_lora_scale(model, alpha: float = 1.0):
|
878 |
+
for _module in model.modules():
|
879 |
+
if _module.__class__.__name__ in ["LoraInjectedLinear", "LoraInjectedConv2d"]:
|
880 |
+
_module.scale = alpha
|
881 |
+
|
882 |
+
|
883 |
+
def set_lora_diag(model, diag: torch.Tensor):
|
884 |
+
for _module in model.modules():
|
885 |
+
if _module.__class__.__name__ in ["LoraInjectedLinear", "LoraInjectedConv2d"]:
|
886 |
+
_module.set_selector_from_diag(diag)
|
887 |
+
|
888 |
+
|
889 |
+
def _text_lora_path(path: str) -> str:
|
890 |
+
assert path.endswith(".pt"), "Only .pt files are supported"
|
891 |
+
return ".".join(path.split(".")[:-1] + ["text_encoder", "pt"])
|
892 |
+
|
893 |
+
|
894 |
+
def _ti_lora_path(path: str) -> str:
|
895 |
+
assert path.endswith(".pt"), "Only .pt files are supported"
|
896 |
+
return ".".join(path.split(".")[:-1] + ["ti", "pt"])
|
897 |
+
|
898 |
+
|
899 |
+
def apply_learned_embed_in_clip(
|
900 |
+
learned_embeds,
|
901 |
+
text_encoder,
|
902 |
+
tokenizer,
|
903 |
+
token: Optional[Union[str, List[str]]] = None,
|
904 |
+
idempotent=False,
|
905 |
+
):
|
906 |
+
if isinstance(token, str):
|
907 |
+
trained_tokens = [token]
|
908 |
+
elif isinstance(token, list):
|
909 |
+
assert len(learned_embeds.keys()) == len(
|
910 |
+
token
|
911 |
+
), "The number of tokens and the number of embeds should be the same"
|
912 |
+
trained_tokens = token
|
913 |
+
else:
|
914 |
+
trained_tokens = list(learned_embeds.keys())
|
915 |
+
|
916 |
+
for token in trained_tokens:
|
917 |
+
print(token)
|
918 |
+
embeds = learned_embeds[token]
|
919 |
+
|
920 |
+
# cast to dtype of text_encoder
|
921 |
+
dtype = text_encoder.get_input_embeddings().weight.dtype
|
922 |
+
num_added_tokens = tokenizer.add_tokens(token)
|
923 |
+
|
924 |
+
i = 1
|
925 |
+
if not idempotent:
|
926 |
+
while num_added_tokens == 0:
|
927 |
+
print(f"The tokenizer already contains the token {token}.")
|
928 |
+
token = f"{token[:-1]}-{i}>"
|
929 |
+
print(f"Attempting to add the token {token}.")
|
930 |
+
num_added_tokens = tokenizer.add_tokens(token)
|
931 |
+
i += 1
|
932 |
+
elif num_added_tokens == 0 and idempotent:
|
933 |
+
print(f"The tokenizer already contains the token {token}.")
|
934 |
+
print(f"Replacing {token} embedding.")
|
935 |
+
|
936 |
+
# resize the token embeddings
|
937 |
+
text_encoder.resize_token_embeddings(len(tokenizer))
|
938 |
+
|
939 |
+
# get the id for the token and assign the embeds
|
940 |
+
token_id = tokenizer.convert_tokens_to_ids(token)
|
941 |
+
text_encoder.get_input_embeddings().weight.data[token_id] = embeds
|
942 |
+
return token
|
943 |
+
|
944 |
+
|
945 |
+
def load_learned_embed_in_clip(
|
946 |
+
learned_embeds_path,
|
947 |
+
text_encoder,
|
948 |
+
tokenizer,
|
949 |
+
token: Optional[Union[str, List[str]]] = None,
|
950 |
+
idempotent=False,
|
951 |
+
):
|
952 |
+
learned_embeds = torch.load(learned_embeds_path)
|
953 |
+
apply_learned_embed_in_clip(
|
954 |
+
learned_embeds, text_encoder, tokenizer, token, idempotent
|
955 |
+
)
|
956 |
+
|
957 |
+
|
958 |
+
def patch_pipe(
|
959 |
+
pipe,
|
960 |
+
maybe_unet_path,
|
961 |
+
token: Optional[str] = None,
|
962 |
+
r: int = 4,
|
963 |
+
patch_unet=True,
|
964 |
+
patch_text=True,
|
965 |
+
patch_ti=True,
|
966 |
+
idempotent_token=True,
|
967 |
+
unet_target_replace_module=DEFAULT_TARGET_REPLACE,
|
968 |
+
text_target_replace_module=TEXT_ENCODER_DEFAULT_TARGET_REPLACE,
|
969 |
+
):
|
970 |
+
if maybe_unet_path.endswith(".pt"):
|
971 |
+
# torch format
|
972 |
+
|
973 |
+
if maybe_unet_path.endswith(".ti.pt"):
|
974 |
+
unet_path = maybe_unet_path[:-6] + ".pt"
|
975 |
+
elif maybe_unet_path.endswith(".text_encoder.pt"):
|
976 |
+
unet_path = maybe_unet_path[:-16] + ".pt"
|
977 |
+
else:
|
978 |
+
unet_path = maybe_unet_path
|
979 |
+
|
980 |
+
ti_path = _ti_lora_path(unet_path)
|
981 |
+
text_path = _text_lora_path(unet_path)
|
982 |
+
|
983 |
+
if patch_unet:
|
984 |
+
print("LoRA : Patching Unet")
|
985 |
+
monkeypatch_or_replace_lora(
|
986 |
+
pipe.unet,
|
987 |
+
torch.load(unet_path),
|
988 |
+
r=r,
|
989 |
+
target_replace_module=unet_target_replace_module,
|
990 |
+
)
|
991 |
+
|
992 |
+
if patch_text:
|
993 |
+
print("LoRA : Patching text encoder")
|
994 |
+
monkeypatch_or_replace_lora(
|
995 |
+
pipe.text_encoder,
|
996 |
+
torch.load(text_path),
|
997 |
+
target_replace_module=text_target_replace_module,
|
998 |
+
r=r,
|
999 |
+
)
|
1000 |
+
if patch_ti:
|
1001 |
+
print("LoRA : Patching token input")
|
1002 |
+
token = load_learned_embed_in_clip(
|
1003 |
+
ti_path,
|
1004 |
+
pipe.text_encoder,
|
1005 |
+
pipe.tokenizer,
|
1006 |
+
token=token,
|
1007 |
+
idempotent=idempotent_token,
|
1008 |
+
)
|
1009 |
+
|
1010 |
+
elif maybe_unet_path.endswith(".safetensors"):
|
1011 |
+
safeloras = safe_open(maybe_unet_path, framework="pt", device="cpu")
|
1012 |
+
monkeypatch_or_replace_safeloras(pipe, safeloras)
|
1013 |
+
tok_dict = parse_safeloras_embeds(safeloras)
|
1014 |
+
if patch_ti:
|
1015 |
+
apply_learned_embed_in_clip(
|
1016 |
+
tok_dict,
|
1017 |
+
pipe.text_encoder,
|
1018 |
+
pipe.tokenizer,
|
1019 |
+
token=token,
|
1020 |
+
idempotent=idempotent_token,
|
1021 |
+
)
|
1022 |
+
return tok_dict
|
1023 |
+
|
1024 |
+
|
1025 |
+
@torch.no_grad()
|
1026 |
+
def inspect_lora(model):
|
1027 |
+
moved = {}
|
1028 |
+
|
1029 |
+
for name, _module in model.named_modules():
|
1030 |
+
if _module.__class__.__name__ in ["LoraInjectedLinear", "LoraInjectedConv2d"]:
|
1031 |
+
ups = _module.lora_up.weight.data.clone()
|
1032 |
+
downs = _module.lora_down.weight.data.clone()
|
1033 |
+
|
1034 |
+
wght: torch.Tensor = ups.flatten(1) @ downs.flatten(1)
|
1035 |
+
|
1036 |
+
dist = wght.flatten().abs().mean().item()
|
1037 |
+
if name in moved:
|
1038 |
+
moved[name].append(dist)
|
1039 |
+
else:
|
1040 |
+
moved[name] = [dist]
|
1041 |
+
|
1042 |
+
return moved
|
1043 |
+
|
1044 |
+
|
1045 |
+
def save_all(
|
1046 |
+
unet,
|
1047 |
+
text_encoder,
|
1048 |
+
save_path,
|
1049 |
+
placeholder_token_ids=None,
|
1050 |
+
placeholder_tokens=None,
|
1051 |
+
save_lora=True,
|
1052 |
+
save_ti=True,
|
1053 |
+
target_replace_module_text=TEXT_ENCODER_DEFAULT_TARGET_REPLACE,
|
1054 |
+
target_replace_module_unet=DEFAULT_TARGET_REPLACE,
|
1055 |
+
safe_form=True,
|
1056 |
+
):
|
1057 |
+
if not safe_form:
|
1058 |
+
# save ti
|
1059 |
+
if save_ti:
|
1060 |
+
ti_path = _ti_lora_path(save_path)
|
1061 |
+
learned_embeds_dict = {}
|
1062 |
+
for tok, tok_id in zip(placeholder_tokens, placeholder_token_ids):
|
1063 |
+
learned_embeds = text_encoder.get_input_embeddings().weight[tok_id]
|
1064 |
+
print(
|
1065 |
+
f"Current Learned Embeddings for {tok}:, id {tok_id} ",
|
1066 |
+
learned_embeds[:4],
|
1067 |
+
)
|
1068 |
+
learned_embeds_dict[tok] = learned_embeds.detach().cpu()
|
1069 |
+
|
1070 |
+
torch.save(learned_embeds_dict, ti_path)
|
1071 |
+
print("Ti saved to ", ti_path)
|
1072 |
+
|
1073 |
+
# save text encoder
|
1074 |
+
if save_lora:
|
1075 |
+
|
1076 |
+
save_lora_weight(
|
1077 |
+
unet, save_path, target_replace_module=target_replace_module_unet
|
1078 |
+
)
|
1079 |
+
print("Unet saved to ", save_path)
|
1080 |
+
|
1081 |
+
save_lora_weight(
|
1082 |
+
text_encoder,
|
1083 |
+
_text_lora_path(save_path),
|
1084 |
+
target_replace_module=target_replace_module_text,
|
1085 |
+
)
|
1086 |
+
print("Text Encoder saved to ", _text_lora_path(save_path))
|
1087 |
+
|
1088 |
+
else:
|
1089 |
+
assert save_path.endswith(
|
1090 |
+
".safetensors"
|
1091 |
+
), f"Save path : {save_path} should end with .safetensors"
|
1092 |
+
|
1093 |
+
loras = {}
|
1094 |
+
embeds = {}
|
1095 |
+
|
1096 |
+
if save_lora:
|
1097 |
+
|
1098 |
+
loras["unet"] = (unet, target_replace_module_unet)
|
1099 |
+
loras["text_encoder"] = (text_encoder, target_replace_module_text)
|
1100 |
+
|
1101 |
+
if save_ti:
|
1102 |
+
for tok, tok_id in zip(placeholder_tokens, placeholder_token_ids):
|
1103 |
+
learned_embeds = text_encoder.get_input_embeddings().weight[tok_id]
|
1104 |
+
print(
|
1105 |
+
f"Current Learned Embeddings for {tok}:, id {tok_id} ",
|
1106 |
+
learned_embeds[:4],
|
1107 |
+
)
|
1108 |
+
embeds[tok] = learned_embeds.detach().cpu()
|
1109 |
+
|
1110 |
+
save_safeloras_with_embeds(loras, embeds, save_path)
|
lora_diffusion/lora_manager.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List
|
2 |
+
import torch
|
3 |
+
from safetensors import safe_open
|
4 |
+
from diffusers import StableDiffusionPipeline
|
5 |
+
from .lora import (
|
6 |
+
monkeypatch_or_replace_safeloras,
|
7 |
+
apply_learned_embed_in_clip,
|
8 |
+
set_lora_diag,
|
9 |
+
parse_safeloras_embeds,
|
10 |
+
)
|
11 |
+
|
12 |
+
|
13 |
+
def lora_join(lora_safetenors: list):
|
14 |
+
metadatas = [dict(safelora.metadata()) for safelora in lora_safetenors]
|
15 |
+
_total_metadata = {}
|
16 |
+
total_metadata = {}
|
17 |
+
total_tensor = {}
|
18 |
+
total_rank = 0
|
19 |
+
ranklist = []
|
20 |
+
for _metadata in metadatas:
|
21 |
+
rankset = []
|
22 |
+
for k, v in _metadata.items():
|
23 |
+
if k.endswith("rank"):
|
24 |
+
rankset.append(int(v))
|
25 |
+
|
26 |
+
assert len(set(rankset)) <= 1, "Rank should be the same per model"
|
27 |
+
if len(rankset) == 0:
|
28 |
+
rankset = [0]
|
29 |
+
|
30 |
+
total_rank += rankset[0]
|
31 |
+
_total_metadata.update(_metadata)
|
32 |
+
ranklist.append(rankset[0])
|
33 |
+
|
34 |
+
# remove metadata about tokens
|
35 |
+
for k, v in _total_metadata.items():
|
36 |
+
if v != "<embed>":
|
37 |
+
total_metadata[k] = v
|
38 |
+
|
39 |
+
tensorkeys = set()
|
40 |
+
for safelora in lora_safetenors:
|
41 |
+
tensorkeys.update(safelora.keys())
|
42 |
+
|
43 |
+
for keys in tensorkeys:
|
44 |
+
if keys.startswith("text_encoder") or keys.startswith("unet"):
|
45 |
+
tensorset = [safelora.get_tensor(keys) for safelora in lora_safetenors]
|
46 |
+
|
47 |
+
is_down = keys.endswith("down")
|
48 |
+
|
49 |
+
if is_down:
|
50 |
+
_tensor = torch.cat(tensorset, dim=0)
|
51 |
+
assert _tensor.shape[0] == total_rank
|
52 |
+
else:
|
53 |
+
_tensor = torch.cat(tensorset, dim=1)
|
54 |
+
assert _tensor.shape[1] == total_rank
|
55 |
+
|
56 |
+
total_tensor[keys] = _tensor
|
57 |
+
keys_rank = ":".join(keys.split(":")[:-1]) + ":rank"
|
58 |
+
total_metadata[keys_rank] = str(total_rank)
|
59 |
+
token_size_list = []
|
60 |
+
for idx, safelora in enumerate(lora_safetenors):
|
61 |
+
tokens = [k for k, v in safelora.metadata().items() if v == "<embed>"]
|
62 |
+
for jdx, token in enumerate(sorted(tokens)):
|
63 |
+
|
64 |
+
total_tensor[f"<s{idx}-{jdx}>"] = safelora.get_tensor(token)
|
65 |
+
total_metadata[f"<s{idx}-{jdx}>"] = "<embed>"
|
66 |
+
|
67 |
+
print(f"Embedding {token} replaced to <s{idx}-{jdx}>")
|
68 |
+
|
69 |
+
token_size_list.append(len(tokens))
|
70 |
+
|
71 |
+
return total_tensor, total_metadata, ranklist, token_size_list
|
72 |
+
|
73 |
+
|
74 |
+
class DummySafeTensorObject:
|
75 |
+
def __init__(self, tensor: dict, metadata):
|
76 |
+
self.tensor = tensor
|
77 |
+
self._metadata = metadata
|
78 |
+
|
79 |
+
def keys(self):
|
80 |
+
return self.tensor.keys()
|
81 |
+
|
82 |
+
def metadata(self):
|
83 |
+
return self._metadata
|
84 |
+
|
85 |
+
def get_tensor(self, key):
|
86 |
+
return self.tensor[key]
|
87 |
+
|
88 |
+
|
89 |
+
class LoRAManager:
|
90 |
+
def __init__(self, lora_paths_list: List[str], pipe: StableDiffusionPipeline):
|
91 |
+
|
92 |
+
self.lora_paths_list = lora_paths_list
|
93 |
+
self.pipe = pipe
|
94 |
+
self._setup()
|
95 |
+
|
96 |
+
def _setup(self):
|
97 |
+
|
98 |
+
self._lora_safetenors = [
|
99 |
+
safe_open(path, framework="pt", device="cpu")
|
100 |
+
for path in self.lora_paths_list
|
101 |
+
]
|
102 |
+
|
103 |
+
(
|
104 |
+
total_tensor,
|
105 |
+
total_metadata,
|
106 |
+
self.ranklist,
|
107 |
+
self.token_size_list,
|
108 |
+
) = lora_join(self._lora_safetenors)
|
109 |
+
|
110 |
+
self.total_safelora = DummySafeTensorObject(total_tensor, total_metadata)
|
111 |
+
|
112 |
+
monkeypatch_or_replace_safeloras(self.pipe, self.total_safelora)
|
113 |
+
tok_dict = parse_safeloras_embeds(self.total_safelora)
|
114 |
+
|
115 |
+
apply_learned_embed_in_clip(
|
116 |
+
tok_dict,
|
117 |
+
self.pipe.text_encoder,
|
118 |
+
self.pipe.tokenizer,
|
119 |
+
token=None,
|
120 |
+
idempotent=True,
|
121 |
+
)
|
122 |
+
|
123 |
+
def tune(self, scales):
|
124 |
+
|
125 |
+
assert len(scales) == len(
|
126 |
+
self.ranklist
|
127 |
+
), "Scale list should be the same length as ranklist"
|
128 |
+
|
129 |
+
diags = []
|
130 |
+
for scale, rank in zip(scales, self.ranklist):
|
131 |
+
diags = diags + [scale] * rank
|
132 |
+
|
133 |
+
set_lora_diag(self.pipe.unet, torch.tensor(diags))
|
134 |
+
|
135 |
+
def prompt(self, prompt):
|
136 |
+
if prompt is not None:
|
137 |
+
for idx, tok_size in enumerate(self.token_size_list):
|
138 |
+
prompt = prompt.replace(
|
139 |
+
f"<{idx + 1}>",
|
140 |
+
"".join([f"<s{idx}-{jdx}>" for jdx in range(tok_size)]),
|
141 |
+
)
|
142 |
+
# TODO : Rescale LoRA + Text inputs based on prompt scale params
|
143 |
+
|
144 |
+
return prompt
|
lora_diffusion/preprocess_files.py
ADDED
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Have SwinIR upsample
|
2 |
+
# Have BLIP auto caption
|
3 |
+
# Have CLIPSeg auto mask concept
|
4 |
+
|
5 |
+
from typing import List, Literal, Union, Optional, Tuple
|
6 |
+
import os
|
7 |
+
from PIL import Image, ImageFilter
|
8 |
+
import torch
|
9 |
+
import numpy as np
|
10 |
+
import fire
|
11 |
+
from tqdm import tqdm
|
12 |
+
import glob
|
13 |
+
from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation
|
14 |
+
|
15 |
+
|
16 |
+
@torch.no_grad()
|
17 |
+
def swin_ir_sr(
|
18 |
+
images: List[Image.Image],
|
19 |
+
model_id: Literal[
|
20 |
+
"caidas/swin2SR-classical-sr-x2-64", "caidas/swin2SR-classical-sr-x4-48"
|
21 |
+
] = "caidas/swin2SR-classical-sr-x2-64",
|
22 |
+
target_size: Optional[Tuple[int, int]] = None,
|
23 |
+
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
|
24 |
+
**kwargs,
|
25 |
+
) -> List[Image.Image]:
|
26 |
+
"""
|
27 |
+
Upscales images using SwinIR. Returns a list of PIL images.
|
28 |
+
"""
|
29 |
+
# So this is currently in main branch, so this can be used in the future I guess?
|
30 |
+
from transformers import Swin2SRForImageSuperResolution, Swin2SRImageProcessor
|
31 |
+
|
32 |
+
model = Swin2SRForImageSuperResolution.from_pretrained(
|
33 |
+
model_id,
|
34 |
+
).to(device)
|
35 |
+
processor = Swin2SRImageProcessor()
|
36 |
+
|
37 |
+
out_images = []
|
38 |
+
|
39 |
+
for image in tqdm(images):
|
40 |
+
|
41 |
+
ori_w, ori_h = image.size
|
42 |
+
if target_size is not None:
|
43 |
+
if ori_w >= target_size[0] and ori_h >= target_size[1]:
|
44 |
+
out_images.append(image)
|
45 |
+
continue
|
46 |
+
|
47 |
+
inputs = processor(image, return_tensors="pt").to(device)
|
48 |
+
with torch.no_grad():
|
49 |
+
outputs = model(**inputs)
|
50 |
+
|
51 |
+
output = (
|
52 |
+
outputs.reconstruction.data.squeeze().float().cpu().clamp_(0, 1).numpy()
|
53 |
+
)
|
54 |
+
output = np.moveaxis(output, source=0, destination=-1)
|
55 |
+
output = (output * 255.0).round().astype(np.uint8)
|
56 |
+
output = Image.fromarray(output)
|
57 |
+
|
58 |
+
out_images.append(output)
|
59 |
+
|
60 |
+
return out_images
|
61 |
+
|
62 |
+
|
63 |
+
@torch.no_grad()
|
64 |
+
def clipseg_mask_generator(
|
65 |
+
images: List[Image.Image],
|
66 |
+
target_prompts: Union[List[str], str],
|
67 |
+
model_id: Literal[
|
68 |
+
"CIDAS/clipseg-rd64-refined", "CIDAS/clipseg-rd16"
|
69 |
+
] = "CIDAS/clipseg-rd64-refined",
|
70 |
+
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
|
71 |
+
bias: float = 0.01,
|
72 |
+
temp: float = 1.0,
|
73 |
+
**kwargs,
|
74 |
+
) -> List[Image.Image]:
|
75 |
+
"""
|
76 |
+
Returns a greyscale mask for each image, where the mask is the probability of the target prompt being present in the image
|
77 |
+
"""
|
78 |
+
|
79 |
+
if isinstance(target_prompts, str):
|
80 |
+
print(
|
81 |
+
f'Warning: only one target prompt "{target_prompts}" was given, so it will be used for all images'
|
82 |
+
)
|
83 |
+
|
84 |
+
target_prompts = [target_prompts] * len(images)
|
85 |
+
|
86 |
+
processor = CLIPSegProcessor.from_pretrained(model_id)
|
87 |
+
model = CLIPSegForImageSegmentation.from_pretrained(model_id).to(device)
|
88 |
+
|
89 |
+
masks = []
|
90 |
+
|
91 |
+
for image, prompt in tqdm(zip(images, target_prompts)):
|
92 |
+
|
93 |
+
original_size = image.size
|
94 |
+
|
95 |
+
inputs = processor(
|
96 |
+
text=[prompt, ""],
|
97 |
+
images=[image] * 2,
|
98 |
+
padding="max_length",
|
99 |
+
truncation=True,
|
100 |
+
return_tensors="pt",
|
101 |
+
).to(device)
|
102 |
+
|
103 |
+
outputs = model(**inputs)
|
104 |
+
|
105 |
+
logits = outputs.logits
|
106 |
+
probs = torch.nn.functional.softmax(logits / temp, dim=0)[0]
|
107 |
+
probs = (probs + bias).clamp_(0, 1)
|
108 |
+
probs = 255 * probs / probs.max()
|
109 |
+
|
110 |
+
# make mask greyscale
|
111 |
+
mask = Image.fromarray(probs.cpu().numpy()).convert("L")
|
112 |
+
|
113 |
+
# resize mask to original size
|
114 |
+
mask = mask.resize(original_size)
|
115 |
+
|
116 |
+
masks.append(mask)
|
117 |
+
|
118 |
+
return masks
|
119 |
+
|
120 |
+
|
121 |
+
@torch.no_grad()
|
122 |
+
def blip_captioning_dataset(
|
123 |
+
images: List[Image.Image],
|
124 |
+
text: Optional[str] = None,
|
125 |
+
model_id: Literal[
|
126 |
+
"Salesforce/blip-image-captioning-large",
|
127 |
+
"Salesforce/blip-image-captioning-base",
|
128 |
+
] = "Salesforce/blip-image-captioning-large",
|
129 |
+
device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
|
130 |
+
**kwargs,
|
131 |
+
) -> List[str]:
|
132 |
+
"""
|
133 |
+
Returns a list of captions for the given images
|
134 |
+
"""
|
135 |
+
|
136 |
+
from transformers import BlipProcessor, BlipForConditionalGeneration
|
137 |
+
|
138 |
+
processor = BlipProcessor.from_pretrained(model_id)
|
139 |
+
model = BlipForConditionalGeneration.from_pretrained(model_id).to(device)
|
140 |
+
captions = []
|
141 |
+
|
142 |
+
for image in tqdm(images):
|
143 |
+
inputs = processor(image, text=text, return_tensors="pt").to("cuda")
|
144 |
+
out = model.generate(
|
145 |
+
**inputs, max_length=150, do_sample=True, top_k=50, temperature=0.7
|
146 |
+
)
|
147 |
+
caption = processor.decode(out[0], skip_special_tokens=True)
|
148 |
+
|
149 |
+
captions.append(caption)
|
150 |
+
|
151 |
+
return captions
|
152 |
+
|
153 |
+
|
154 |
+
def face_mask_google_mediapipe(
|
155 |
+
images: List[Image.Image], blur_amount: float = 80.0, bias: float = 0.05
|
156 |
+
) -> List[Image.Image]:
|
157 |
+
"""
|
158 |
+
Returns a list of images with mask on the face parts.
|
159 |
+
"""
|
160 |
+
import mediapipe as mp
|
161 |
+
|
162 |
+
mp_face_detection = mp.solutions.face_detection
|
163 |
+
|
164 |
+
face_detection = mp_face_detection.FaceDetection(
|
165 |
+
model_selection=1, min_detection_confidence=0.5
|
166 |
+
)
|
167 |
+
|
168 |
+
masks = []
|
169 |
+
for image in tqdm(images):
|
170 |
+
|
171 |
+
image = np.array(image)
|
172 |
+
|
173 |
+
results = face_detection.process(image)
|
174 |
+
black_image = np.ones((image.shape[0], image.shape[1]), dtype=np.uint8)
|
175 |
+
|
176 |
+
if results.detections:
|
177 |
+
|
178 |
+
for detection in results.detections:
|
179 |
+
|
180 |
+
x_min = int(
|
181 |
+
detection.location_data.relative_bounding_box.xmin * image.shape[1]
|
182 |
+
)
|
183 |
+
y_min = int(
|
184 |
+
detection.location_data.relative_bounding_box.ymin * image.shape[0]
|
185 |
+
)
|
186 |
+
width = int(
|
187 |
+
detection.location_data.relative_bounding_box.width * image.shape[1]
|
188 |
+
)
|
189 |
+
height = int(
|
190 |
+
detection.location_data.relative_bounding_box.height
|
191 |
+
* image.shape[0]
|
192 |
+
)
|
193 |
+
|
194 |
+
# draw the colored rectangle
|
195 |
+
black_image[y_min : y_min + height, x_min : x_min + width] = 255
|
196 |
+
|
197 |
+
black_image = Image.fromarray(black_image)
|
198 |
+
masks.append(black_image)
|
199 |
+
|
200 |
+
return masks
|
201 |
+
|
202 |
+
|
203 |
+
def _crop_to_square(
|
204 |
+
image: Image.Image, com: List[Tuple[int, int]], resize_to: Optional[int] = None
|
205 |
+
):
|
206 |
+
cx, cy = com
|
207 |
+
width, height = image.size
|
208 |
+
if width > height:
|
209 |
+
left_possible = max(cx - height / 2, 0)
|
210 |
+
left = min(left_possible, width - height)
|
211 |
+
right = left + height
|
212 |
+
top = 0
|
213 |
+
bottom = height
|
214 |
+
else:
|
215 |
+
left = 0
|
216 |
+
right = width
|
217 |
+
top_possible = max(cy - width / 2, 0)
|
218 |
+
top = min(top_possible, height - width)
|
219 |
+
bottom = top + width
|
220 |
+
|
221 |
+
image = image.crop((left, top, right, bottom))
|
222 |
+
|
223 |
+
if resize_to:
|
224 |
+
image = image.resize((resize_to, resize_to), Image.Resampling.LANCZOS)
|
225 |
+
|
226 |
+
return image
|
227 |
+
|
228 |
+
|
229 |
+
def _center_of_mass(mask: Image.Image):
|
230 |
+
"""
|
231 |
+
Returns the center of mass of the mask
|
232 |
+
"""
|
233 |
+
x, y = np.meshgrid(np.arange(mask.size[0]), np.arange(mask.size[1]))
|
234 |
+
|
235 |
+
x_ = x * np.array(mask)
|
236 |
+
y_ = y * np.array(mask)
|
237 |
+
|
238 |
+
x = np.sum(x_) / np.sum(mask)
|
239 |
+
y = np.sum(y_) / np.sum(mask)
|
240 |
+
|
241 |
+
return x, y
|
242 |
+
|
243 |
+
|
244 |
+
def load_and_save_masks_and_captions(
|
245 |
+
files: Union[str, List[str]],
|
246 |
+
output_dir: str,
|
247 |
+
caption_text: Optional[str] = None,
|
248 |
+
target_prompts: Optional[Union[List[str], str]] = None,
|
249 |
+
target_size: int = 512,
|
250 |
+
crop_based_on_salience: bool = True,
|
251 |
+
use_face_detection_instead: bool = False,
|
252 |
+
temp: float = 1.0,
|
253 |
+
n_length: int = -1,
|
254 |
+
):
|
255 |
+
"""
|
256 |
+
Loads images from the given files, generates masks for them, and saves the masks and captions and upscale images
|
257 |
+
to output dir.
|
258 |
+
"""
|
259 |
+
os.makedirs(output_dir, exist_ok=True)
|
260 |
+
|
261 |
+
# load images
|
262 |
+
if isinstance(files, str):
|
263 |
+
# check if it is a directory
|
264 |
+
if os.path.isdir(files):
|
265 |
+
# get all the .png .jpg in the directory
|
266 |
+
files = glob.glob(os.path.join(files, "*.png")) + glob.glob(
|
267 |
+
os.path.join(files, "*.jpg")
|
268 |
+
)
|
269 |
+
|
270 |
+
if len(files) == 0:
|
271 |
+
raise Exception(
|
272 |
+
f"No files found in {files}. Either {files} is not a directory or it does not contain any .png or .jpg files."
|
273 |
+
)
|
274 |
+
if n_length == -1:
|
275 |
+
n_length = len(files)
|
276 |
+
files = sorted(files)[:n_length]
|
277 |
+
|
278 |
+
images = [Image.open(file) for file in files]
|
279 |
+
|
280 |
+
# captions
|
281 |
+
print(f"Generating {len(images)} captions...")
|
282 |
+
captions = blip_captioning_dataset(images, text=caption_text)
|
283 |
+
|
284 |
+
if target_prompts is None:
|
285 |
+
target_prompts = captions
|
286 |
+
|
287 |
+
print(f"Generating {len(images)} masks...")
|
288 |
+
if not use_face_detection_instead:
|
289 |
+
seg_masks = clipseg_mask_generator(
|
290 |
+
images=images, target_prompts=target_prompts, temp=temp
|
291 |
+
)
|
292 |
+
else:
|
293 |
+
seg_masks = face_mask_google_mediapipe(images=images)
|
294 |
+
|
295 |
+
# find the center of mass of the mask
|
296 |
+
if crop_based_on_salience:
|
297 |
+
coms = [_center_of_mass(mask) for mask in seg_masks]
|
298 |
+
else:
|
299 |
+
coms = [(image.size[0] / 2, image.size[1] / 2) for image in images]
|
300 |
+
# based on the center of mass, crop the image to a square
|
301 |
+
images = [
|
302 |
+
_crop_to_square(image, com, resize_to=None) for image, com in zip(images, coms)
|
303 |
+
]
|
304 |
+
|
305 |
+
print(f"Upscaling {len(images)} images...")
|
306 |
+
# upscale images anyways
|
307 |
+
images = swin_ir_sr(images, target_size=(target_size, target_size))
|
308 |
+
images = [
|
309 |
+
image.resize((target_size, target_size), Image.Resampling.LANCZOS)
|
310 |
+
for image in images
|
311 |
+
]
|
312 |
+
|
313 |
+
seg_masks = [
|
314 |
+
_crop_to_square(mask, com, resize_to=target_size)
|
315 |
+
for mask, com in zip(seg_masks, coms)
|
316 |
+
]
|
317 |
+
with open(os.path.join(output_dir, "caption.txt"), "w") as f:
|
318 |
+
# save images and masks
|
319 |
+
for idx, (image, mask, caption) in enumerate(zip(images, seg_masks, captions)):
|
320 |
+
image.save(os.path.join(output_dir, f"{idx}.src.jpg"), quality=99)
|
321 |
+
mask.save(os.path.join(output_dir, f"{idx}.mask.png"))
|
322 |
+
|
323 |
+
f.write(caption + "\n")
|
324 |
+
|
325 |
+
|
326 |
+
def main():
|
327 |
+
fire.Fire(load_and_save_masks_and_captions)
|
lora_diffusion/safe_open.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Pure python version of Safetensors safe_open
|
3 |
+
From https://gist.github.com/Narsil/3edeec2669a5e94e4707aa0f901d2282
|
4 |
+
"""
|
5 |
+
|
6 |
+
import json
|
7 |
+
import mmap
|
8 |
+
import os
|
9 |
+
|
10 |
+
import torch
|
11 |
+
|
12 |
+
|
13 |
+
class SafetensorsWrapper:
|
14 |
+
def __init__(self, metadata, tensors):
|
15 |
+
self._metadata = metadata
|
16 |
+
self._tensors = tensors
|
17 |
+
|
18 |
+
def metadata(self):
|
19 |
+
return self._metadata
|
20 |
+
|
21 |
+
def keys(self):
|
22 |
+
return self._tensors.keys()
|
23 |
+
|
24 |
+
def get_tensor(self, k):
|
25 |
+
return self._tensors[k]
|
26 |
+
|
27 |
+
|
28 |
+
DTYPES = {
|
29 |
+
"F32": torch.float32,
|
30 |
+
"F16": torch.float16,
|
31 |
+
"BF16": torch.bfloat16,
|
32 |
+
}
|
33 |
+
|
34 |
+
|
35 |
+
def create_tensor(storage, info, offset):
|
36 |
+
dtype = DTYPES[info["dtype"]]
|
37 |
+
shape = info["shape"]
|
38 |
+
start, stop = info["data_offsets"]
|
39 |
+
return (
|
40 |
+
torch.asarray(storage[start + offset : stop + offset], dtype=torch.uint8)
|
41 |
+
.view(dtype=dtype)
|
42 |
+
.reshape(shape)
|
43 |
+
)
|
44 |
+
|
45 |
+
|
46 |
+
def safe_open(filename, framework="pt", device="cpu"):
|
47 |
+
if framework != "pt":
|
48 |
+
raise ValueError("`framework` must be 'pt'")
|
49 |
+
|
50 |
+
with open(filename, mode="r", encoding="utf8") as file_obj:
|
51 |
+
with mmap.mmap(file_obj.fileno(), length=0, access=mmap.ACCESS_READ) as m:
|
52 |
+
header = m.read(8)
|
53 |
+
n = int.from_bytes(header, "little")
|
54 |
+
metadata_bytes = m.read(n)
|
55 |
+
metadata = json.loads(metadata_bytes)
|
56 |
+
|
57 |
+
size = os.stat(filename).st_size
|
58 |
+
storage = torch.ByteStorage.from_file(filename, shared=False, size=size).untyped()
|
59 |
+
offset = n + 8
|
60 |
+
|
61 |
+
return SafetensorsWrapper(
|
62 |
+
metadata=metadata.get("__metadata__", {}),
|
63 |
+
tensors={
|
64 |
+
name: create_tensor(storage, info, offset).to(device)
|
65 |
+
for name, info in metadata.items()
|
66 |
+
if name != "__metadata__"
|
67 |
+
},
|
68 |
+
)
|
lora_diffusion/to_ckpt_v2.py
ADDED
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# from https://gist.github.com/jachiam/8a5c0b607e38fcc585168b90c686eb05
|
2 |
+
# Script for converting a HF Diffusers saved pipeline to a Stable Diffusion checkpoint.
|
3 |
+
# *Only* converts the UNet, VAE, and Text Encoder.
|
4 |
+
# Does not convert optimizer state or any other thing.
|
5 |
+
# Written by jachiam
|
6 |
+
import argparse
|
7 |
+
import os.path as osp
|
8 |
+
|
9 |
+
import torch
|
10 |
+
|
11 |
+
|
12 |
+
# =================#
|
13 |
+
# UNet Conversion #
|
14 |
+
# =================#
|
15 |
+
|
16 |
+
unet_conversion_map = [
|
17 |
+
# (stable-diffusion, HF Diffusers)
|
18 |
+
("time_embed.0.weight", "time_embedding.linear_1.weight"),
|
19 |
+
("time_embed.0.bias", "time_embedding.linear_1.bias"),
|
20 |
+
("time_embed.2.weight", "time_embedding.linear_2.weight"),
|
21 |
+
("time_embed.2.bias", "time_embedding.linear_2.bias"),
|
22 |
+
("input_blocks.0.0.weight", "conv_in.weight"),
|
23 |
+
("input_blocks.0.0.bias", "conv_in.bias"),
|
24 |
+
("out.0.weight", "conv_norm_out.weight"),
|
25 |
+
("out.0.bias", "conv_norm_out.bias"),
|
26 |
+
("out.2.weight", "conv_out.weight"),
|
27 |
+
("out.2.bias", "conv_out.bias"),
|
28 |
+
]
|
29 |
+
|
30 |
+
unet_conversion_map_resnet = [
|
31 |
+
# (stable-diffusion, HF Diffusers)
|
32 |
+
("in_layers.0", "norm1"),
|
33 |
+
("in_layers.2", "conv1"),
|
34 |
+
("out_layers.0", "norm2"),
|
35 |
+
("out_layers.3", "conv2"),
|
36 |
+
("emb_layers.1", "time_emb_proj"),
|
37 |
+
("skip_connection", "conv_shortcut"),
|
38 |
+
]
|
39 |
+
|
40 |
+
unet_conversion_map_layer = []
|
41 |
+
# hardcoded number of downblocks and resnets/attentions...
|
42 |
+
# would need smarter logic for other networks.
|
43 |
+
for i in range(4):
|
44 |
+
# loop over downblocks/upblocks
|
45 |
+
|
46 |
+
for j in range(2):
|
47 |
+
# loop over resnets/attentions for downblocks
|
48 |
+
hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
|
49 |
+
sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
|
50 |
+
unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
|
51 |
+
|
52 |
+
if i < 3:
|
53 |
+
# no attention layers in down_blocks.3
|
54 |
+
hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
|
55 |
+
sd_down_atn_prefix = f"input_blocks.{3*i + j + 1}.1."
|
56 |
+
unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
|
57 |
+
|
58 |
+
for j in range(3):
|
59 |
+
# loop over resnets/attentions for upblocks
|
60 |
+
hf_up_res_prefix = f"up_blocks.{i}.resnets.{j}."
|
61 |
+
sd_up_res_prefix = f"output_blocks.{3*i + j}.0."
|
62 |
+
unet_conversion_map_layer.append((sd_up_res_prefix, hf_up_res_prefix))
|
63 |
+
|
64 |
+
if i > 0:
|
65 |
+
# no attention layers in up_blocks.0
|
66 |
+
hf_up_atn_prefix = f"up_blocks.{i}.attentions.{j}."
|
67 |
+
sd_up_atn_prefix = f"output_blocks.{3*i + j}.1."
|
68 |
+
unet_conversion_map_layer.append((sd_up_atn_prefix, hf_up_atn_prefix))
|
69 |
+
|
70 |
+
if i < 3:
|
71 |
+
# no downsample in down_blocks.3
|
72 |
+
hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
|
73 |
+
sd_downsample_prefix = f"input_blocks.{3*(i+1)}.0.op."
|
74 |
+
unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
|
75 |
+
|
76 |
+
# no upsample in up_blocks.3
|
77 |
+
hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
|
78 |
+
sd_upsample_prefix = f"output_blocks.{3*i + 2}.{1 if i == 0 else 2}."
|
79 |
+
unet_conversion_map_layer.append((sd_upsample_prefix, hf_upsample_prefix))
|
80 |
+
|
81 |
+
hf_mid_atn_prefix = "mid_block.attentions.0."
|
82 |
+
sd_mid_atn_prefix = "middle_block.1."
|
83 |
+
unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
|
84 |
+
|
85 |
+
for j in range(2):
|
86 |
+
hf_mid_res_prefix = f"mid_block.resnets.{j}."
|
87 |
+
sd_mid_res_prefix = f"middle_block.{2*j}."
|
88 |
+
unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
|
89 |
+
|
90 |
+
|
91 |
+
def convert_unet_state_dict(unet_state_dict):
|
92 |
+
# buyer beware: this is a *brittle* function,
|
93 |
+
# and correct output requires that all of these pieces interact in
|
94 |
+
# the exact order in which I have arranged them.
|
95 |
+
mapping = {k: k for k in unet_state_dict.keys()}
|
96 |
+
for sd_name, hf_name in unet_conversion_map:
|
97 |
+
mapping[hf_name] = sd_name
|
98 |
+
for k, v in mapping.items():
|
99 |
+
if "resnets" in k:
|
100 |
+
for sd_part, hf_part in unet_conversion_map_resnet:
|
101 |
+
v = v.replace(hf_part, sd_part)
|
102 |
+
mapping[k] = v
|
103 |
+
for k, v in mapping.items():
|
104 |
+
for sd_part, hf_part in unet_conversion_map_layer:
|
105 |
+
v = v.replace(hf_part, sd_part)
|
106 |
+
mapping[k] = v
|
107 |
+
new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items()}
|
108 |
+
return new_state_dict
|
109 |
+
|
110 |
+
|
111 |
+
# ================#
|
112 |
+
# VAE Conversion #
|
113 |
+
# ================#
|
114 |
+
|
115 |
+
vae_conversion_map = [
|
116 |
+
# (stable-diffusion, HF Diffusers)
|
117 |
+
("nin_shortcut", "conv_shortcut"),
|
118 |
+
("norm_out", "conv_norm_out"),
|
119 |
+
("mid.attn_1.", "mid_block.attentions.0."),
|
120 |
+
]
|
121 |
+
|
122 |
+
for i in range(4):
|
123 |
+
# down_blocks have two resnets
|
124 |
+
for j in range(2):
|
125 |
+
hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}."
|
126 |
+
sd_down_prefix = f"encoder.down.{i}.block.{j}."
|
127 |
+
vae_conversion_map.append((sd_down_prefix, hf_down_prefix))
|
128 |
+
|
129 |
+
if i < 3:
|
130 |
+
hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0."
|
131 |
+
sd_downsample_prefix = f"down.{i}.downsample."
|
132 |
+
vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix))
|
133 |
+
|
134 |
+
hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
|
135 |
+
sd_upsample_prefix = f"up.{3-i}.upsample."
|
136 |
+
vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix))
|
137 |
+
|
138 |
+
# up_blocks have three resnets
|
139 |
+
# also, up blocks in hf are numbered in reverse from sd
|
140 |
+
for j in range(3):
|
141 |
+
hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}."
|
142 |
+
sd_up_prefix = f"decoder.up.{3-i}.block.{j}."
|
143 |
+
vae_conversion_map.append((sd_up_prefix, hf_up_prefix))
|
144 |
+
|
145 |
+
# this part accounts for mid blocks in both the encoder and the decoder
|
146 |
+
for i in range(2):
|
147 |
+
hf_mid_res_prefix = f"mid_block.resnets.{i}."
|
148 |
+
sd_mid_res_prefix = f"mid.block_{i+1}."
|
149 |
+
vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix))
|
150 |
+
|
151 |
+
|
152 |
+
vae_conversion_map_attn = [
|
153 |
+
# (stable-diffusion, HF Diffusers)
|
154 |
+
("norm.", "group_norm."),
|
155 |
+
("q.", "query."),
|
156 |
+
("k.", "key."),
|
157 |
+
("v.", "value."),
|
158 |
+
("proj_out.", "proj_attn."),
|
159 |
+
]
|
160 |
+
|
161 |
+
|
162 |
+
def reshape_weight_for_sd(w):
|
163 |
+
# convert HF linear weights to SD conv2d weights
|
164 |
+
return w.reshape(*w.shape, 1, 1)
|
165 |
+
|
166 |
+
|
167 |
+
def convert_vae_state_dict(vae_state_dict):
|
168 |
+
mapping = {k: k for k in vae_state_dict.keys()}
|
169 |
+
for k, v in mapping.items():
|
170 |
+
for sd_part, hf_part in vae_conversion_map:
|
171 |
+
v = v.replace(hf_part, sd_part)
|
172 |
+
mapping[k] = v
|
173 |
+
for k, v in mapping.items():
|
174 |
+
if "attentions" in k:
|
175 |
+
for sd_part, hf_part in vae_conversion_map_attn:
|
176 |
+
v = v.replace(hf_part, sd_part)
|
177 |
+
mapping[k] = v
|
178 |
+
new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()}
|
179 |
+
weights_to_convert = ["q", "k", "v", "proj_out"]
|
180 |
+
for k, v in new_state_dict.items():
|
181 |
+
for weight_name in weights_to_convert:
|
182 |
+
if f"mid.attn_1.{weight_name}.weight" in k:
|
183 |
+
print(f"Reshaping {k} for SD format")
|
184 |
+
new_state_dict[k] = reshape_weight_for_sd(v)
|
185 |
+
return new_state_dict
|
186 |
+
|
187 |
+
|
188 |
+
# =========================#
|
189 |
+
# Text Encoder Conversion #
|
190 |
+
# =========================#
|
191 |
+
# pretty much a no-op
|
192 |
+
|
193 |
+
|
194 |
+
def convert_text_enc_state_dict(text_enc_dict):
|
195 |
+
return text_enc_dict
|
196 |
+
|
197 |
+
|
198 |
+
def convert_to_ckpt(model_path, checkpoint_path, as_half):
|
199 |
+
|
200 |
+
assert model_path is not None, "Must provide a model path!"
|
201 |
+
|
202 |
+
assert checkpoint_path is not None, "Must provide a checkpoint path!"
|
203 |
+
|
204 |
+
unet_path = osp.join(model_path, "unet", "diffusion_pytorch_model.bin")
|
205 |
+
vae_path = osp.join(model_path, "vae", "diffusion_pytorch_model.bin")
|
206 |
+
text_enc_path = osp.join(model_path, "text_encoder", "pytorch_model.bin")
|
207 |
+
|
208 |
+
# Convert the UNet model
|
209 |
+
unet_state_dict = torch.load(unet_path, map_location="cpu")
|
210 |
+
unet_state_dict = convert_unet_state_dict(unet_state_dict)
|
211 |
+
unet_state_dict = {
|
212 |
+
"model.diffusion_model." + k: v for k, v in unet_state_dict.items()
|
213 |
+
}
|
214 |
+
|
215 |
+
# Convert the VAE model
|
216 |
+
vae_state_dict = torch.load(vae_path, map_location="cpu")
|
217 |
+
vae_state_dict = convert_vae_state_dict(vae_state_dict)
|
218 |
+
vae_state_dict = {"first_stage_model." + k: v for k, v in vae_state_dict.items()}
|
219 |
+
|
220 |
+
# Convert the text encoder model
|
221 |
+
text_enc_dict = torch.load(text_enc_path, map_location="cpu")
|
222 |
+
text_enc_dict = convert_text_enc_state_dict(text_enc_dict)
|
223 |
+
text_enc_dict = {
|
224 |
+
"cond_stage_model.transformer." + k: v for k, v in text_enc_dict.items()
|
225 |
+
}
|
226 |
+
|
227 |
+
# Put together new checkpoint
|
228 |
+
state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict}
|
229 |
+
if as_half:
|
230 |
+
state_dict = {k: v.half() for k, v in state_dict.items()}
|
231 |
+
state_dict = {"state_dict": state_dict}
|
232 |
+
torch.save(state_dict, checkpoint_path)
|
lora_diffusion/utils.py
ADDED
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Union
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from PIL import Image
|
5 |
+
from transformers import (
|
6 |
+
CLIPProcessor,
|
7 |
+
CLIPTextModelWithProjection,
|
8 |
+
CLIPTokenizer,
|
9 |
+
CLIPVisionModelWithProjection,
|
10 |
+
)
|
11 |
+
|
12 |
+
from diffusers import StableDiffusionPipeline
|
13 |
+
from .lora import patch_pipe, tune_lora_scale, _text_lora_path, _ti_lora_path
|
14 |
+
import os
|
15 |
+
import glob
|
16 |
+
import math
|
17 |
+
|
18 |
+
EXAMPLE_PROMPTS = [
|
19 |
+
"<obj> swimming in a pool",
|
20 |
+
"<obj> at a beach with a view of seashore",
|
21 |
+
"<obj> in times square",
|
22 |
+
"<obj> wearing sunglasses",
|
23 |
+
"<obj> in a construction outfit",
|
24 |
+
"<obj> playing with a ball",
|
25 |
+
"<obj> wearing headphones",
|
26 |
+
"<obj> oil painting ghibli inspired",
|
27 |
+
"<obj> working on the laptop",
|
28 |
+
"<obj> with mountains and sunset in background",
|
29 |
+
"Painting of <obj> at a beach by artist claude monet",
|
30 |
+
"<obj> digital painting 3d render geometric style",
|
31 |
+
"A screaming <obj>",
|
32 |
+
"A depressed <obj>",
|
33 |
+
"A sleeping <obj>",
|
34 |
+
"A sad <obj>",
|
35 |
+
"A joyous <obj>",
|
36 |
+
"A frowning <obj>",
|
37 |
+
"A sculpture of <obj>",
|
38 |
+
"<obj> near a pool",
|
39 |
+
"<obj> at a beach with a view of seashore",
|
40 |
+
"<obj> in a garden",
|
41 |
+
"<obj> in grand canyon",
|
42 |
+
"<obj> floating in ocean",
|
43 |
+
"<obj> and an armchair",
|
44 |
+
"A maple tree on the side of <obj>",
|
45 |
+
"<obj> and an orange sofa",
|
46 |
+
"<obj> with chocolate cake on it",
|
47 |
+
"<obj> with a vase of rose flowers on it",
|
48 |
+
"A digital illustration of <obj>",
|
49 |
+
"Georgia O'Keeffe style <obj> painting",
|
50 |
+
"A watercolor painting of <obj> on a beach",
|
51 |
+
]
|
52 |
+
|
53 |
+
|
54 |
+
def image_grid(_imgs, rows=None, cols=None):
|
55 |
+
|
56 |
+
if rows is None and cols is None:
|
57 |
+
rows = cols = math.ceil(len(_imgs) ** 0.5)
|
58 |
+
|
59 |
+
if rows is None:
|
60 |
+
rows = math.ceil(len(_imgs) / cols)
|
61 |
+
if cols is None:
|
62 |
+
cols = math.ceil(len(_imgs) / rows)
|
63 |
+
|
64 |
+
w, h = _imgs[0].size
|
65 |
+
grid = Image.new("RGB", size=(cols * w, rows * h))
|
66 |
+
grid_w, grid_h = grid.size
|
67 |
+
|
68 |
+
for i, img in enumerate(_imgs):
|
69 |
+
grid.paste(img, box=(i % cols * w, i // cols * h))
|
70 |
+
return grid
|
71 |
+
|
72 |
+
|
73 |
+
def text_img_alignment(img_embeds, text_embeds, target_img_embeds):
|
74 |
+
# evaluation inspired from textual inversion paper
|
75 |
+
# https://arxiv.org/abs/2208.01618
|
76 |
+
|
77 |
+
# text alignment
|
78 |
+
assert img_embeds.shape[0] == text_embeds.shape[0]
|
79 |
+
text_img_sim = (img_embeds * text_embeds).sum(dim=-1) / (
|
80 |
+
img_embeds.norm(dim=-1) * text_embeds.norm(dim=-1)
|
81 |
+
)
|
82 |
+
|
83 |
+
# image alignment
|
84 |
+
img_embed_normalized = img_embeds / img_embeds.norm(dim=-1, keepdim=True)
|
85 |
+
|
86 |
+
avg_target_img_embed = (
|
87 |
+
(target_img_embeds / target_img_embeds.norm(dim=-1, keepdim=True))
|
88 |
+
.mean(dim=0)
|
89 |
+
.unsqueeze(0)
|
90 |
+
.repeat(img_embeds.shape[0], 1)
|
91 |
+
)
|
92 |
+
|
93 |
+
img_img_sim = (img_embed_normalized * avg_target_img_embed).sum(dim=-1)
|
94 |
+
|
95 |
+
return {
|
96 |
+
"text_alignment_avg": text_img_sim.mean().item(),
|
97 |
+
"image_alignment_avg": img_img_sim.mean().item(),
|
98 |
+
"text_alignment_all": text_img_sim.tolist(),
|
99 |
+
"image_alignment_all": img_img_sim.tolist(),
|
100 |
+
}
|
101 |
+
|
102 |
+
|
103 |
+
def prepare_clip_model_sets(eval_clip_id: str = "openai/clip-vit-large-patch14"):
|
104 |
+
text_model = CLIPTextModelWithProjection.from_pretrained(eval_clip_id)
|
105 |
+
tokenizer = CLIPTokenizer.from_pretrained(eval_clip_id)
|
106 |
+
vis_model = CLIPVisionModelWithProjection.from_pretrained(eval_clip_id)
|
107 |
+
processor = CLIPProcessor.from_pretrained(eval_clip_id)
|
108 |
+
|
109 |
+
return text_model, tokenizer, vis_model, processor
|
110 |
+
|
111 |
+
|
112 |
+
def evaluate_pipe(
|
113 |
+
pipe,
|
114 |
+
target_images: List[Image.Image],
|
115 |
+
class_token: str = "",
|
116 |
+
learnt_token: str = "",
|
117 |
+
guidance_scale: float = 5.0,
|
118 |
+
seed=0,
|
119 |
+
clip_model_sets=None,
|
120 |
+
eval_clip_id: str = "openai/clip-vit-large-patch14",
|
121 |
+
n_test: int = 10,
|
122 |
+
n_step: int = 50,
|
123 |
+
):
|
124 |
+
|
125 |
+
if clip_model_sets is not None:
|
126 |
+
text_model, tokenizer, vis_model, processor = clip_model_sets
|
127 |
+
else:
|
128 |
+
text_model, tokenizer, vis_model, processor = prepare_clip_model_sets(
|
129 |
+
eval_clip_id
|
130 |
+
)
|
131 |
+
|
132 |
+
images = []
|
133 |
+
img_embeds = []
|
134 |
+
text_embeds = []
|
135 |
+
for prompt in EXAMPLE_PROMPTS[:n_test]:
|
136 |
+
prompt = prompt.replace("<obj>", learnt_token)
|
137 |
+
torch.manual_seed(seed)
|
138 |
+
with torch.autocast("cuda"):
|
139 |
+
img = pipe(
|
140 |
+
prompt, num_inference_steps=n_step, guidance_scale=guidance_scale
|
141 |
+
).images[0]
|
142 |
+
images.append(img)
|
143 |
+
|
144 |
+
# image
|
145 |
+
inputs = processor(images=img, return_tensors="pt")
|
146 |
+
img_embed = vis_model(**inputs).image_embeds
|
147 |
+
img_embeds.append(img_embed)
|
148 |
+
|
149 |
+
prompt = prompt.replace(learnt_token, class_token)
|
150 |
+
# prompts
|
151 |
+
inputs = tokenizer([prompt], padding=True, return_tensors="pt")
|
152 |
+
outputs = text_model(**inputs)
|
153 |
+
text_embed = outputs.text_embeds
|
154 |
+
text_embeds.append(text_embed)
|
155 |
+
|
156 |
+
# target images
|
157 |
+
inputs = processor(images=target_images, return_tensors="pt")
|
158 |
+
target_img_embeds = vis_model(**inputs).image_embeds
|
159 |
+
|
160 |
+
img_embeds = torch.cat(img_embeds, dim=0)
|
161 |
+
text_embeds = torch.cat(text_embeds, dim=0)
|
162 |
+
|
163 |
+
return text_img_alignment(img_embeds, text_embeds, target_img_embeds)
|
164 |
+
|
165 |
+
|
166 |
+
def visualize_progress(
|
167 |
+
path_alls: Union[str, List[str]],
|
168 |
+
prompt: str,
|
169 |
+
model_id: str = "runwayml/stable-diffusion-v1-5",
|
170 |
+
device="cuda:0",
|
171 |
+
patch_unet=True,
|
172 |
+
patch_text=True,
|
173 |
+
patch_ti=True,
|
174 |
+
unet_scale=1.0,
|
175 |
+
text_sclae=1.0,
|
176 |
+
num_inference_steps=50,
|
177 |
+
guidance_scale=5.0,
|
178 |
+
offset: int = 0,
|
179 |
+
limit: int = 10,
|
180 |
+
seed: int = 0,
|
181 |
+
):
|
182 |
+
|
183 |
+
imgs = []
|
184 |
+
if isinstance(path_alls, str):
|
185 |
+
alls = list(set(glob.glob(path_alls)))
|
186 |
+
|
187 |
+
alls.sort(key=os.path.getmtime)
|
188 |
+
else:
|
189 |
+
alls = path_alls
|
190 |
+
|
191 |
+
pipe = StableDiffusionPipeline.from_pretrained(
|
192 |
+
model_id, torch_dtype=torch.float16
|
193 |
+
).to(device)
|
194 |
+
|
195 |
+
print(f"Found {len(alls)} checkpoints")
|
196 |
+
for path in alls[offset:limit]:
|
197 |
+
print(path)
|
198 |
+
|
199 |
+
patch_pipe(
|
200 |
+
pipe, path, patch_unet=patch_unet, patch_text=patch_text, patch_ti=patch_ti
|
201 |
+
)
|
202 |
+
|
203 |
+
tune_lora_scale(pipe.unet, unet_scale)
|
204 |
+
tune_lora_scale(pipe.text_encoder, text_sclae)
|
205 |
+
|
206 |
+
torch.manual_seed(seed)
|
207 |
+
image = pipe(
|
208 |
+
prompt,
|
209 |
+
num_inference_steps=num_inference_steps,
|
210 |
+
guidance_scale=guidance_scale,
|
211 |
+
).images[0]
|
212 |
+
imgs.append(image)
|
213 |
+
|
214 |
+
return imgs
|
lora_diffusion/xformers_utils.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import functools
|
2 |
+
|
3 |
+
import torch
|
4 |
+
from diffusers.models.attention import BasicTransformerBlock
|
5 |
+
from diffusers.utils.import_utils import is_xformers_available
|
6 |
+
|
7 |
+
from .lora import LoraInjectedLinear
|
8 |
+
|
9 |
+
if is_xformers_available():
|
10 |
+
import xformers
|
11 |
+
import xformers.ops
|
12 |
+
else:
|
13 |
+
xformers = None
|
14 |
+
|
15 |
+
|
16 |
+
@functools.cache
|
17 |
+
def test_xformers_backwards(size):
|
18 |
+
@torch.enable_grad()
|
19 |
+
def _grad(size):
|
20 |
+
q = torch.randn((1, 4, size), device="cuda")
|
21 |
+
k = torch.randn((1, 4, size), device="cuda")
|
22 |
+
v = torch.randn((1, 4, size), device="cuda")
|
23 |
+
|
24 |
+
q = q.detach().requires_grad_()
|
25 |
+
k = k.detach().requires_grad_()
|
26 |
+
v = v.detach().requires_grad_()
|
27 |
+
|
28 |
+
out = xformers.ops.memory_efficient_attention(q, k, v)
|
29 |
+
loss = out.sum(2).mean(0).sum()
|
30 |
+
|
31 |
+
return torch.autograd.grad(loss, v)
|
32 |
+
|
33 |
+
try:
|
34 |
+
_grad(size)
|
35 |
+
print(size, "pass")
|
36 |
+
return True
|
37 |
+
except Exception as e:
|
38 |
+
print(size, "fail")
|
39 |
+
return False
|
40 |
+
|
41 |
+
|
42 |
+
def set_use_memory_efficient_attention_xformers(
|
43 |
+
module: torch.nn.Module, valid: bool
|
44 |
+
) -> None:
|
45 |
+
def fn_test_dim_head(module: torch.nn.Module):
|
46 |
+
if isinstance(module, BasicTransformerBlock):
|
47 |
+
# dim_head isn't stored anywhere, so back-calculate
|
48 |
+
source = module.attn1.to_v
|
49 |
+
if isinstance(source, LoraInjectedLinear):
|
50 |
+
source = source.linear
|
51 |
+
|
52 |
+
dim_head = source.out_features // module.attn1.heads
|
53 |
+
|
54 |
+
result = test_xformers_backwards(dim_head)
|
55 |
+
|
56 |
+
# If dim_head > dim_head_max, turn xformers off
|
57 |
+
if not result:
|
58 |
+
module.set_use_memory_efficient_attention_xformers(False)
|
59 |
+
|
60 |
+
for child in module.children():
|
61 |
+
fn_test_dim_head(child)
|
62 |
+
|
63 |
+
if not is_xformers_available() and valid:
|
64 |
+
print("XFormers is not available. Skipping.")
|
65 |
+
return
|
66 |
+
|
67 |
+
module.set_use_memory_efficient_attention_xformers(valid)
|
68 |
+
|
69 |
+
if valid:
|
70 |
+
fn_test_dim_head(module)
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
diffusers
|
2 |
+
accelerate
|
3 |
+
transformers>=4.25.1
|
train_dreambooth_cloneofsimo_lora.py
ADDED
@@ -0,0 +1,1008 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Bootstrapped from:
|
2 |
+
# https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py
|
3 |
+
|
4 |
+
import argparse
|
5 |
+
import hashlib
|
6 |
+
import itertools
|
7 |
+
import math
|
8 |
+
import os
|
9 |
+
import inspect
|
10 |
+
from pathlib import Path
|
11 |
+
from typing import Optional
|
12 |
+
|
13 |
+
import torch
|
14 |
+
import torch.nn.functional as F
|
15 |
+
import torch.utils.checkpoint
|
16 |
+
|
17 |
+
|
18 |
+
from accelerate import Accelerator
|
19 |
+
from accelerate.logging import get_logger
|
20 |
+
from accelerate.utils import set_seed
|
21 |
+
from diffusers import (
|
22 |
+
AutoencoderKL,
|
23 |
+
DDPMScheduler,
|
24 |
+
StableDiffusionPipeline,
|
25 |
+
UNet2DConditionModel,
|
26 |
+
)
|
27 |
+
from diffusers.optimization import get_scheduler
|
28 |
+
from huggingface_hub import HfFolder, Repository, whoami
|
29 |
+
|
30 |
+
from tqdm.auto import tqdm
|
31 |
+
from transformers import CLIPTextModel, CLIPTokenizer
|
32 |
+
|
33 |
+
from lora_diffusion import (
|
34 |
+
extract_lora_ups_down,
|
35 |
+
inject_trainable_lora,
|
36 |
+
safetensors_available,
|
37 |
+
save_lora_weight,
|
38 |
+
save_safeloras,
|
39 |
+
)
|
40 |
+
from lora_diffusion.xformers_utils import set_use_memory_efficient_attention_xformers
|
41 |
+
from PIL import Image
|
42 |
+
from torch.utils.data import Dataset
|
43 |
+
from torchvision import transforms
|
44 |
+
|
45 |
+
from pathlib import Path
|
46 |
+
|
47 |
+
import random
|
48 |
+
import re
|
49 |
+
|
50 |
+
|
51 |
+
class DreamBoothDataset(Dataset):
|
52 |
+
"""
|
53 |
+
A dataset to prepare the instance and class images with the prompts for fine-tuning the model.
|
54 |
+
It pre-processes the images and the tokenizes prompts.
|
55 |
+
"""
|
56 |
+
|
57 |
+
def __init__(
|
58 |
+
self,
|
59 |
+
instance_data_root,
|
60 |
+
instance_prompt,
|
61 |
+
tokenizer,
|
62 |
+
class_data_root=None,
|
63 |
+
class_prompt=None,
|
64 |
+
size=512,
|
65 |
+
center_crop=False,
|
66 |
+
color_jitter=False,
|
67 |
+
h_flip=False,
|
68 |
+
resize=False,
|
69 |
+
):
|
70 |
+
self.size = size
|
71 |
+
self.center_crop = center_crop
|
72 |
+
self.tokenizer = tokenizer
|
73 |
+
self.resize = resize
|
74 |
+
|
75 |
+
self.instance_data_root = Path(instance_data_root)
|
76 |
+
if not self.instance_data_root.exists():
|
77 |
+
raise ValueError("Instance images root doesn't exists.")
|
78 |
+
|
79 |
+
self.instance_images_path = list(Path(instance_data_root).iterdir())
|
80 |
+
self.num_instance_images = len(self.instance_images_path)
|
81 |
+
self.instance_prompt = instance_prompt
|
82 |
+
self._length = self.num_instance_images
|
83 |
+
|
84 |
+
if class_data_root is not None:
|
85 |
+
self.class_data_root = Path(class_data_root)
|
86 |
+
self.class_data_root.mkdir(parents=True, exist_ok=True)
|
87 |
+
self.class_images_path = list(self.class_data_root.iterdir())
|
88 |
+
self.num_class_images = len(self.class_images_path)
|
89 |
+
self._length = max(self.num_class_images, self.num_instance_images)
|
90 |
+
self.class_prompt = class_prompt
|
91 |
+
else:
|
92 |
+
self.class_data_root = None
|
93 |
+
|
94 |
+
img_transforms = []
|
95 |
+
|
96 |
+
if resize:
|
97 |
+
img_transforms.append(
|
98 |
+
transforms.Resize(
|
99 |
+
size, interpolation=transforms.InterpolationMode.BILINEAR
|
100 |
+
)
|
101 |
+
)
|
102 |
+
if center_crop:
|
103 |
+
img_transforms.append(transforms.CenterCrop(size))
|
104 |
+
if color_jitter:
|
105 |
+
img_transforms.append(transforms.ColorJitter(0.2, 0.1))
|
106 |
+
if h_flip:
|
107 |
+
img_transforms.append(transforms.RandomHorizontalFlip())
|
108 |
+
|
109 |
+
self.image_transforms = transforms.Compose(
|
110 |
+
[*img_transforms, transforms.ToTensor(), transforms.Normalize([0.5], [0.5])]
|
111 |
+
)
|
112 |
+
|
113 |
+
def __len__(self):
|
114 |
+
return self._length
|
115 |
+
|
116 |
+
def __getitem__(self, index):
|
117 |
+
example = {}
|
118 |
+
instance_image = Image.open(
|
119 |
+
self.instance_images_path[index % self.num_instance_images]
|
120 |
+
)
|
121 |
+
if not instance_image.mode == "RGB":
|
122 |
+
instance_image = instance_image.convert("RGB")
|
123 |
+
example["instance_images"] = self.image_transforms(instance_image)
|
124 |
+
example["instance_prompt_ids"] = self.tokenizer(
|
125 |
+
self.instance_prompt,
|
126 |
+
padding="do_not_pad",
|
127 |
+
truncation=True,
|
128 |
+
max_length=self.tokenizer.model_max_length,
|
129 |
+
).input_ids
|
130 |
+
|
131 |
+
if self.class_data_root:
|
132 |
+
class_image = Image.open(
|
133 |
+
self.class_images_path[index % self.num_class_images]
|
134 |
+
)
|
135 |
+
if not class_image.mode == "RGB":
|
136 |
+
class_image = class_image.convert("RGB")
|
137 |
+
example["class_images"] = self.image_transforms(class_image)
|
138 |
+
example["class_prompt_ids"] = self.tokenizer(
|
139 |
+
self.class_prompt,
|
140 |
+
padding="do_not_pad",
|
141 |
+
truncation=True,
|
142 |
+
max_length=self.tokenizer.model_max_length,
|
143 |
+
).input_ids
|
144 |
+
|
145 |
+
return example
|
146 |
+
|
147 |
+
|
148 |
+
class PromptDataset(Dataset):
|
149 |
+
"A simple dataset to prepare the prompts to generate class images on multiple GPUs."
|
150 |
+
|
151 |
+
def __init__(self, prompt, num_samples):
|
152 |
+
self.prompt = prompt
|
153 |
+
self.num_samples = num_samples
|
154 |
+
|
155 |
+
def __len__(self):
|
156 |
+
return self.num_samples
|
157 |
+
|
158 |
+
def __getitem__(self, index):
|
159 |
+
example = {}
|
160 |
+
example["prompt"] = self.prompt
|
161 |
+
example["index"] = index
|
162 |
+
return example
|
163 |
+
|
164 |
+
|
165 |
+
logger = get_logger(__name__)
|
166 |
+
|
167 |
+
|
168 |
+
def parse_args(input_args=None):
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
        "--pretrained_model_name_or_path",
        type=str,
        default=None,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--pretrained_vae_name_or_path",
        type=str,
        default=None,
        help="Path to pretrained vae or vae identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--revision",
        type=str,
        default=None,
        required=False,
        help="Revision of pretrained model identifier from huggingface.co/models.",
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default=None,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--instance_data_dir",
        type=str,
        default=None,
        required=True,
        help="A folder containing the training data of instance images.",
    )
    parser.add_argument(
        "--class_data_dir",
        type=str,
        default=None,
        required=False,
        help="A folder containing the training data of class images.",
    )
    parser.add_argument(
        "--instance_prompt",
        type=str,
        default=None,
        required=True,
        help="The prompt with identifier specifying the instance",
    )
    parser.add_argument(
        "--class_prompt",
        type=str,
        default=None,
        help="The prompt to specify images in the same class as provided instance images.",
    )
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
        action="store_true",
        help="Flag to add prior preservation loss.",
    )
    parser.add_argument(
        "--prior_loss_weight",
        type=float,
        default=1.0,
        help="The weight of prior preservation loss.",
    )
    parser.add_argument(
        "--num_class_images",
        type=int,
        default=100,
        help=(
            "Minimal class images for prior preservation loss. If there are not enough images, additional images"
            " will be sampled with class_prompt."
        ),
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="text-inversion-model",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--output_format",
        type=str,
        choices=["pt", "safe", "both"],
        default="both",
        help="The output format of the model predictions and checkpoints.",
    )
    parser.add_argument(
        "--seed", type=int, default=None, help="A seed for reproducible training."
    )
    parser.add_argument(
        "--resolution",
        type=int,
        default=512,
        help=(
            "The resolution for input images, all the images in the train/validation dataset will be resized to this"
            " resolution"
        ),
    )
    parser.add_argument(
        "--center_crop",
        action="store_true",
        help="Whether to center crop images before resizing to resolution",
    )
    parser.add_argument(
        "--color_jitter",
        action="store_true",
        help="Whether to apply color jitter to images",
    )
    parser.add_argument(
        "--train_text_encoder",
        action="store_true",
        help="Whether to train the text encoder",
    )
    parser.add_argument(
        "--train_batch_size",
        type=int,
        default=4,
        help="Batch size (per device) for the training dataloader.",
    )
    parser.add_argument(
        "--sample_batch_size",
        type=int,
        default=4,
        help="Batch size (per device) for sampling images.",
    )
    parser.add_argument("--num_train_epochs", type=int, default=1)
    parser.add_argument(
        "--max_train_steps",
        type=int,
        default=None,
        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
    )
    parser.add_argument(
        "--save_steps",
        type=int,
        default=500,
        help="Save checkpoint every X update steps.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--gradient_checkpointing",
        action="store_true",
        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
    )
    parser.add_argument(
        "--lora_rank",
        type=int,
        default=4,
        help="Rank of LoRA approximation.",
    )
    parser.add_argument(
        "--learning_rate",
        type=float,
        default=None,
        help="Initial learning rate (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--learning_rate_text",
        type=float,
        default=5e-6,
        help="Initial learning rate for text encoder (after the potential warmup period) to use.",
    )
    parser.add_argument(
        "--scale_lr",
        action="store_true",
        default=False,
        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
    )
    parser.add_argument(
        "--lr_scheduler",
        type=str,
        default="constant",
        help=(
            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
            ' "constant", "constant_with_warmup"]'
        ),
    )
    parser.add_argument(
        "--lr_warmup_steps",
        type=int,
        default=500,
        help="Number of steps for the warmup in the lr scheduler.",
    )
    parser.add_argument(
        "--use_8bit_adam",
        action="store_true",
        help="Whether or not to use 8-bit Adam from bitsandbytes.",
    )
    parser.add_argument(
        "--adam_beta1",
        type=float,
        default=0.9,
        help="The beta1 parameter for the Adam optimizer.",
    )
    parser.add_argument(
        "--adam_beta2",
        type=float,
        default=0.999,
        help="The beta2 parameter for the Adam optimizer.",
    )
    parser.add_argument(
        "--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use."
    )
    parser.add_argument(
        "--adam_epsilon",
        type=float,
        default=1e-08,
        help="Epsilon value for the Adam optimizer",
    )
    parser.add_argument(
        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
    )
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        help="Whether or not to push the model to the Hub.",
    )
    parser.add_argument(
        "--hub_token",
        type=str,
        default=None,
        help="The token to use to push to the Model Hub.",
    )
    parser.add_argument(
        "--logging_dir",
        type=str,
        default="logs",
        help=(
            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
            " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***."
        ),
    )
    parser.add_argument(
        "--mixed_precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16"],
        help=(
            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
            " 1.10 and an Nvidia Ampere GPU. Defaults to the value of the accelerate config of the current system or"
            " the flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
        ),
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument(
        "--resume_unet",
        type=str,
        default=None,
        help=("File path for unet lora to resume training."),
    )
    parser.add_argument(
        "--resume_text_encoder",
        type=str,
        default=None,
        help=("File path for text encoder lora to resume training."),
    )
    parser.add_argument(
        "--resize",
        type=bool,
        default=True,
        required=False,
        help="Should images be resized to --resolution before training?",
    )
    parser.add_argument(
        "--use_xformers", action="store_true", help="Whether or not to use xformers"
    )

    if input_args is not None:
        args = parser.parse_args(input_args)
    else:
        args = parser.parse_args()

    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank

    if args.with_prior_preservation:
        if args.class_data_dir is None:
            raise ValueError("You must specify a data directory for class images.")
        if args.class_prompt is None:
            raise ValueError("You must specify prompt for class images.")
    else:
        if args.class_data_dir is not None:
            logger.warning(
                "You need not use --class_data_dir without --with_prior_preservation."
            )
        if args.class_prompt is not None:
            logger.warning(
                "You need not use --class_prompt without --with_prior_preservation."
            )

    if not safetensors_available:
        if args.output_format == "both":
            print(
                "Safetensors is not available - changing output format to just output PyTorch files"
            )
            args.output_format = "pt"
        elif args.output_format == "safe":
            raise ValueError(
                "Safetensors is not available - either install it, or change output_format."
            )

    return args

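# Example invocation (illustrative only; the model id, directories and prompts below are
# placeholders, not values used by this Space):
#
#   accelerate launch <path to this script> \
#     --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
#     --instance_data_dir="./instance-images" \
#     --instance_prompt="a photo of sks person" \
#     --output_dir="./lora-output" \
#     --resolution=512 \
#     --train_batch_size=1 \
#     --learning_rate=1e-4 \
#     --lora_rank=4 \
#     --max_train_steps=1000
#
# Every flag maps to an argument defined in parse_args() above.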
def main(args):
    logging_dir = Path(args.output_dir, args.logging_dir)

    accelerator = Accelerator(
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        mixed_precision=args.mixed_precision,
        log_with="tensorboard",
        logging_dir=logging_dir,
    )

    # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
    # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models.
    # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate.
    if (
        args.train_text_encoder
        and args.gradient_accumulation_steps > 1
        and accelerator.num_processes > 1
    ):
        raise ValueError(
            "Gradient accumulation is not supported when training the text encoder in distributed training. "
            "Please set gradient_accumulation_steps to 1. This feature will be supported in the future."
        )

    if args.seed is not None:
        set_seed(args.seed)

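    # Prior preservation (DreamBooth): before fine-tuning, generate a pool of generic images
    # for the class prompt with the unmodified base model. These are mixed into training below
    # so the model keeps its prior for the class while learning the new instance.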
    if args.with_prior_preservation:
        class_images_dir = Path(args.class_data_dir)
        if not class_images_dir.exists():
            class_images_dir.mkdir(parents=True)
        cur_class_images = len(list(class_images_dir.iterdir()))

        if cur_class_images < args.num_class_images:
            torch_dtype = (
                torch.float16 if accelerator.device.type == "cuda" else torch.float32
            )
            pipeline = StableDiffusionPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
                torch_dtype=torch_dtype,
                safety_checker=None,
                revision=args.revision,
            )
            pipeline.set_progress_bar_config(disable=True)

            num_new_images = args.num_class_images - cur_class_images
            logger.info(f"Number of class images to sample: {num_new_images}.")

            sample_dataset = PromptDataset(args.class_prompt, num_new_images)
            sample_dataloader = torch.utils.data.DataLoader(
                sample_dataset, batch_size=args.sample_batch_size
            )

            sample_dataloader = accelerator.prepare(sample_dataloader)
            pipeline.to(accelerator.device)

            for example in tqdm(
                sample_dataloader,
                desc="Generating class images",
                disable=not accelerator.is_local_main_process,
            ):
                images = pipeline(example["prompt"]).images

                for i, image in enumerate(images):
                    hash_image = hashlib.sha1(image.tobytes()).hexdigest()
                    image_filename = (
                        class_images_dir
                        / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg"
                    )
                    image.save(image_filename)

            del pipeline
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # Handle the repository creation
    if accelerator.is_main_process:

        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)

    # Load the tokenizer
    if args.tokenizer_name:
        tokenizer = CLIPTokenizer.from_pretrained(
            args.tokenizer_name,
            revision=args.revision,
        )
    elif args.pretrained_model_name_or_path:
        tokenizer = CLIPTokenizer.from_pretrained(
            args.pretrained_model_name_or_path,
            subfolder="tokenizer",
            revision=args.revision,
        )

    # Load models and create wrapper for stable diffusion
    text_encoder = CLIPTextModel.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="text_encoder",
        revision=args.revision,
    )
    vae = AutoencoderKL.from_pretrained(
        args.pretrained_vae_name_or_path or args.pretrained_model_name_or_path,
        subfolder=None if args.pretrained_vae_name_or_path else "vae",
        revision=None if args.pretrained_vae_name_or_path else args.revision,
    )
    unet = UNet2DConditionModel.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="unet",
        revision=args.revision,
    )
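    # LoRA: the base UNet weights are frozen and inject_trainable_lora() (imported from the
    # bundled lora_diffusion package) adds a low-rank adapter pair to each targeted module, so
    # an adapted layer computes roughly W(x) + up(down(x)) with down: d -> r and up: r -> d,
    # where r = --lora_rank. Only these small adapter matrices receive gradients and are saved.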
    unet.requires_grad_(False)
    unet_lora_params, _ = inject_trainable_lora(
        unet, r=args.lora_rank, loras=args.resume_unet
    )

    for _up, _down in extract_lora_ups_down(unet):
        print("Before training: Unet First Layer lora up", _up.weight.data)
        print("Before training: Unet First Layer lora down", _down.weight.data)
        break

    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)

    if args.train_text_encoder:
        text_encoder_lora_params, _ = inject_trainable_lora(
            text_encoder,
            target_replace_module=["CLIPAttention"],
            r=args.lora_rank,
        )
        for _up, _down in extract_lora_ups_down(
            text_encoder, target_replace_module=["CLIPAttention"]
        ):
            print("Before training: text encoder First Layer lora up", _up.weight.data)
            print(
                "Before training: text encoder First Layer lora down", _down.weight.data
            )
            break

    if args.use_xformers:
        set_use_memory_efficient_attention_xformers(unet, True)
        set_use_memory_efficient_attention_xformers(vae, True)

    if args.gradient_checkpointing:
        unet.enable_gradient_checkpointing()
        if args.train_text_encoder:
            text_encoder.gradient_checkpointing_enable()

    if args.scale_lr:
        args.learning_rate = (
            args.learning_rate
            * args.gradient_accumulation_steps
            * args.train_batch_size
            * accelerator.num_processes
        )

    # Use 8-bit Adam for lower memory usage or to fine-tune the model on 16GB GPUs
    if args.use_8bit_adam:
        try:
            import bitsandbytes as bnb
        except ImportError:
            raise ImportError(
                "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
            )

        optimizer_class = bnb.optim.AdamW8bit
    else:
        optimizer_class = torch.optim.AdamW

    text_lr = (
        args.learning_rate
        if args.learning_rate_text is None
        else args.learning_rate_text
    )

    params_to_optimize = (
        [
            {"params": itertools.chain(*unet_lora_params), "lr": args.learning_rate},
            {
                "params": itertools.chain(*text_encoder_lora_params),
                "lr": text_lr,
            },
        ]
        if args.train_text_encoder
        else itertools.chain(*unet_lora_params)
    )
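    # Note: only the injected LoRA parameters collected above are handed to the optimizer;
    # the frozen UNet / text encoder base weights are never updated.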
    optimizer = optimizer_class(
        params_to_optimize,
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        eps=args.adam_epsilon,
    )

    noise_scheduler = DDPMScheduler.from_config(
        args.pretrained_model_name_or_path, subfolder="scheduler"
    )

    train_dataset = DreamBoothDataset(
        instance_data_root=args.instance_data_dir,
        instance_prompt=args.instance_prompt,
        class_data_root=args.class_data_dir if args.with_prior_preservation else None,
        class_prompt=args.class_prompt,
        tokenizer=tokenizer,
        size=args.resolution,
        center_crop=args.center_crop,
        color_jitter=args.color_jitter,
        resize=args.resize,
    )

    def collate_fn(examples):
        input_ids = [example["instance_prompt_ids"] for example in examples]
        pixel_values = [example["instance_images"] for example in examples]

        # Concat class and instance examples for prior preservation.
        # We do this to avoid doing two forward passes.
        if args.with_prior_preservation:
            input_ids += [example["class_prompt_ids"] for example in examples]
            pixel_values += [example["class_images"] for example in examples]

        pixel_values = torch.stack(pixel_values)
        pixel_values = pixel_values.to(memory_format=torch.contiguous_format).float()

        input_ids = tokenizer.pad(
            {"input_ids": input_ids},
            padding="max_length",
            max_length=tokenizer.model_max_length,
            return_tensors="pt",
        ).input_ids

        batch = {
            "input_ids": input_ids,
            "pixel_values": pixel_values,
        }
        return batch

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        collate_fn=collate_fn,
        num_workers=1,
    )

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps
    )
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
        num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
    )

    if args.train_text_encoder:
        (
            unet,
            text_encoder,
            optimizer,
            train_dataloader,
            lr_scheduler,
        ) = accelerator.prepare(
            unet, text_encoder, optimizer, train_dataloader, lr_scheduler
        )
    else:
        unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
            unet, optimizer, train_dataloader, lr_scheduler
        )

    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
        weight_dtype = torch.float16
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    # Move text_encoder and vae to gpu.
    # For mixed precision training we cast the text_encoder and vae weights to half-precision
    # as these models are only used for inference, keeping weights in full precision is not required.
    vae.to(accelerator.device, dtype=weight_dtype)
    if not args.train_text_encoder:
        text_encoder.to(accelerator.device, dtype=weight_dtype)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps
    )
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # We need to initialize the trackers we use, and also store our configuration.
    # The trackers initialize automatically on the main process.
    if accelerator.is_main_process:
        accelerator.init_trackers("dreambooth", config=vars(args))

    # Train!
    total_batch_size = (
        args.train_batch_size
        * accelerator.num_processes
        * args.gradient_accumulation_steps
    )

    print("***** Running training *****")
    print(f" Num examples = {len(train_dataset)}")
    print(f" Num batches each epoch = {len(train_dataloader)}")
    print(f" Num Epochs = {args.num_train_epochs}")
    print(f" Instantaneous batch size per device = {args.train_batch_size}")
    print(
        f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    print(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    print(f" Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(
        range(args.max_train_steps), disable=not accelerator.is_local_main_process
    )
    progress_bar.set_description("Steps")
    global_step = 0
    last_save = 0

    for epoch in range(args.num_train_epochs):
        unet.train()
        if args.train_text_encoder:
            text_encoder.train()

        for step, batch in enumerate(train_dataloader):
            # Convert images to latent space
            latents = vae.encode(
                batch["pixel_values"].to(dtype=weight_dtype)
            ).latent_dist.sample()
            latents = latents * 0.18215
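            # 0.18215 is Stable Diffusion's VAE latent scaling factor; it brings the latents
            # to roughly unit variance before they are fed to the UNet.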

            # Sample noise that we'll add to the latents
            noise = torch.randn_like(latents)
            bsz = latents.shape[0]
            # Sample a random timestep for each image
            timesteps = torch.randint(
                0,
                noise_scheduler.config.num_train_timesteps,
                (bsz,),
                device=latents.device,
            )
            timesteps = timesteps.long()

            # Add noise to the latents according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

            # Get the text embedding for conditioning
            encoder_hidden_states = text_encoder(batch["input_ids"])[0]

            # Predict the noise residual
            model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

            # Get the target for loss depending on the prediction type
            if noise_scheduler.config.prediction_type == "epsilon":
                target = noise
            elif noise_scheduler.config.prediction_type == "v_prediction":
                target = noise_scheduler.get_velocity(latents, noise, timesteps)
            else:
                raise ValueError(
                    f"Unknown prediction type {noise_scheduler.config.prediction_type}"
                )

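            # With prior preservation, collate_fn concatenated instance and class examples
            # along the batch dimension, so the first half of model_pred/target is the
            # instance batch and the second half is the class (prior) batch.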
            if args.with_prior_preservation:
                # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
                model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
                target, target_prior = torch.chunk(target, 2, dim=0)

                # Compute instance loss
                loss = (
                    F.mse_loss(model_pred.float(), target.float(), reduction="none")
                    .mean([1, 2, 3])
                    .mean()
                )

                # Compute prior loss
                prior_loss = F.mse_loss(
                    model_pred_prior.float(), target_prior.float(), reduction="mean"
                )

                # Add the prior loss to the instance loss.
                loss = loss + args.prior_loss_weight * prior_loss
            else:
                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")

            accelerator.backward(loss)
            if accelerator.sync_gradients:
                params_to_clip = (
                    itertools.chain(unet.parameters(), text_encoder.parameters())
                    if args.train_text_encoder
                    else unet.parameters()
                )
                accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
            optimizer.step()
            lr_scheduler.step()
            progress_bar.update(1)
            optimizer.zero_grad()

            global_step += 1

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                if args.save_steps and global_step - last_save >= args.save_steps:
                    if accelerator.is_main_process:
                        # Newer versions of accelerate allow the 'keep_fp32_wrapper' arg. Without passing
                        # it, the models will be unwrapped, and when they are then used for further training,
                        # we will crash. Pass this, but only to newer versions of accelerate. Fixes
                        # https://github.com/huggingface/diffusers/issues/1566
                        accepts_keep_fp32_wrapper = "keep_fp32_wrapper" in set(
                            inspect.signature(
                                accelerator.unwrap_model
                            ).parameters.keys()
                        )
                        extra_args = (
                            {"keep_fp32_wrapper": True}
                            if accepts_keep_fp32_wrapper
                            else {}
                        )
                        pipeline = StableDiffusionPipeline.from_pretrained(
                            args.pretrained_model_name_or_path,
                            unet=accelerator.unwrap_model(unet, **extra_args),
                            text_encoder=accelerator.unwrap_model(
                                text_encoder, **extra_args
                            ),
                            revision=args.revision,
                        )

                        filename_unet = (
                            f"{args.output_dir}/lora_weight_e{epoch}_s{global_step}.pt"
                        )
                        filename_text_encoder = f"{args.output_dir}/lora_weight_e{epoch}_s{global_step}.text_encoder.pt"
                        print(f"save weights {filename_unet}, {filename_text_encoder}")
                        save_lora_weight(pipeline.unet, filename_unet)
                        if args.train_text_encoder:
                            save_lora_weight(
                                pipeline.text_encoder,
                                filename_text_encoder,
                                target_replace_module=["CLIPAttention"],
                            )

                        for _up, _down in extract_lora_ups_down(pipeline.unet):
                            print(
                                "First Unet Layer's Up Weight is now : ",
                                _up.weight.data,
                            )
                            print(
                                "First Unet Layer's Down Weight is now : ",
                                _down.weight.data,
                            )
                            break
                        if args.train_text_encoder:
                            for _up, _down in extract_lora_ups_down(
                                pipeline.text_encoder,
                                target_replace_module=["CLIPAttention"],
                            ):
                                print(
                                    "First Text Encoder Layer's Up Weight is now : ",
                                    _up.weight.data,
                                )
                                print(
                                    "First Text Encoder Layer's Down Weight is now : ",
                                    _down.weight.data,
                                )
                                break

                        last_save = global_step

            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)

            if global_step >= args.max_train_steps:
                break

    accelerator.wait_for_everyone()

    # Create the pipeline using the trained modules and save it.
    if accelerator.is_main_process:
        pipeline = StableDiffusionPipeline.from_pretrained(
            args.pretrained_model_name_or_path,
            unet=accelerator.unwrap_model(unet),
            text_encoder=accelerator.unwrap_model(text_encoder),
            revision=args.revision,
        )

        print("\n\nLora TRAINING DONE!\n\n")

        if args.output_format == "pt" or args.output_format == "both":
            save_lora_weight(pipeline.unet, args.output_dir + "/lora_weight.pt")
            if args.train_text_encoder:
                save_lora_weight(
                    pipeline.text_encoder,
                    args.output_dir + "/lora_weight.text_encoder.pt",
                    target_replace_module=["CLIPAttention"],
                )

        if args.output_format == "safe" or args.output_format == "both":
            loras = {}
            loras["unet"] = (pipeline.unet, {"CrossAttention", "Attention", "GEGLU"})
            if args.train_text_encoder:
                loras["text_encoder"] = (pipeline.text_encoder, {"CLIPAttention"})

            save_safeloras(loras, args.output_dir + "/lora_weight.safetensors")

        if args.push_to_hub:
            repo.push_to_hub(
                commit_message="End of training",
                blocking=False,
                auto_lfs_prune=True,
            )

    accelerator.end_training()

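# Illustrative inference sketch (not executed by this script): the saved LoRA weights can be
# applied to a base pipeline with helpers from the bundled lora_diffusion package, e.g.
#
#   from diffusers import StableDiffusionPipeline
#   from lora_diffusion import patch_pipe, tune_lora_scale
#
#   pipe = StableDiffusionPipeline.from_pretrained("<base model or path>").to("cuda")
#   patch_pipe(pipe, "<output_dir>/lora_weight.safetensors", patch_text=True, patch_unet=True)
#   tune_lora_scale(pipe.unet, 0.8)
#   image = pipe("a photo of sks person").images[0]
#
# Exact helper names and signatures follow the lora_diffusion version vendored in this repo.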
if __name__ == "__main__":
    args = parse_args()
    main(args)