Spaces: Running on Zero
Upload folder using huggingface_hub
- .ipynb_checkpoints/hf_demo_test-checkpoint.ipynb +336 -0
- README.md +3 -10
- __pycache__/inference.cpython-39.pyc +0 -0
- custom_datasets/__init__.py +141 -0
- custom_datasets/__pycache__/__init__.cpython-39.pyc +0 -0
- custom_datasets/__pycache__/coco.cpython-39.pyc +0 -0
- custom_datasets/__pycache__/imagepair.cpython-39.pyc +0 -0
- custom_datasets/__pycache__/mypath.cpython-39.pyc +0 -0
- custom_datasets/coco.py +307 -0
- custom_datasets/custom_caption.py +113 -0
- custom_datasets/filt/coco/filt.py +186 -0
- custom_datasets/filt/sam_filt.py +299 -0
- custom_datasets/imagepair.py +240 -0
- custom_datasets/lhq.py +127 -0
- custom_datasets/mypath.py +29 -0
- custom_datasets/sam.py +160 -0
- data/Art_adapters/albert-gleizes_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/andre-derain_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/andy_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/camille-corot_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/gerhard-richter_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/henri-matisse_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/jackson-pollock_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/joan-miro_subset2/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/kandinsky_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/katsushika-hokusai_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/klimt_subset3/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/m.c.-escher_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/monet_subset2/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/picasso_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/roy-lichtenstein_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/van_gogh_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/Art_adapters/walter-battiss_subset2/adapter_alpha1.0_rank1_all_up_1000steps.pt +3 -0
- data/unsafe.png +0 -0
- hf_demo.py +147 -0
- hf_demo_test.ipynb +336 -0
- inference.py +657 -0
- utils/__init__.py +1 -0
- utils/__pycache__/__init__.cpython-39.pyc +0 -0
- utils/__pycache__/lora.cpython-39.pyc +0 -0
- utils/__pycache__/metrics.cpython-39.pyc +0 -0
- utils/__pycache__/train_util.cpython-39.pyc +0 -0
- utils/art_filter.py +210 -0
- utils/config_util.py +105 -0
- utils/debug_util.py +16 -0
- utils/lora.py +282 -0
- utils/metrics.py +577 -0
- utils/model_util.py +291 -0
- utils/prompt_util.py +174 -0
- utils/train_util.py +526 -0
.ipynb_checkpoints/hf_demo_test-checkpoint.ipynb
ADDED
@@ -0,0 +1,336 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "initial_id",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-09T09:44:30.641366Z",
     "start_time": "2024-12-09T09:44:11.789050Z"
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import gradio as gr\n",
    "from diffusers import DiffusionPipeline\n",
    "import matplotlib.pyplot as plt\n",
    "import torch\n",
    "from PIL import Image\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ddf33e0d3abacc2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "#append current path\n",
    "sys.path.extend(\"/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/release/hf_demo\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "643e49fd601daf8f",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-09T09:44:35.790962Z",
     "start_time": "2024-12-09T09:44:35.779496Z"
    }
   },
   "outputs": [],
   "source": [
    "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e03aae2a4e5676dd",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-09T09:44:44.157412Z",
     "start_time": "2024-12-09T09:44:37.138452Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/vision/torralba/selfmanaged/torralba/scratch/jomat/sam_dataset/miniforge3/envs/diffusion/lib/python3.9/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9df8347307674ba8afb0250e23109aa1",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading pipeline components...: 0%| | 0/7 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "pipe = DiffusionPipeline.from_pretrained(\"rhfeiyang/art-free-diffusion-v1\",).to(\"cuda\")\n",
    "device = \"cuda\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "83916bc68ff5d914",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-09T09:44:52.694399Z",
     "start_time": "2024-12-09T09:44:44.210695Z"
    }
   },
   "outputs": [],
   "source": [
    "from inference import get_lora_network, inference, get_validation_dataloader\n",
    "lora_map = {\n",
    "    \"None\": \"None\",\n",
    "    \"Andre Derain\": \"andre-derain_subset1\",\n",
    "    \"Vincent van Gogh\": \"van_gogh_subset1\",\n",
    "    \"Andy Warhol\": \"andy_subset1\",\n",
    "    \"Walter Battiss\": \"walter-battiss_subset2\",\n",
    "    \"Camille Corot\": \"camille-corot_subset1\",\n",
    "    \"Claude Monet\": \"monet_subset2\",\n",
    "    \"Pablo Picasso\": \"picasso_subset1\",\n",
    "    \"Jackson Pollock\": \"jackson-pollock_subset1\",\n",
    "    \"Gerhard Richter\": \"gerhard-richter_subset1\",\n",
    "    \"M.C. Escher\": \"m.c.-escher_subset1\",\n",
    "    \"Albert Gleizes\": \"albert-gleizes_subset1\",\n",
    "    \"Hokusai\": \"katsushika-hokusai_subset1\",\n",
    "    \"Wassily Kandinsky\": \"kandinsky_subset1\",\n",
    "    \"Gustav Klimt\": \"klimt_subset3\",\n",
    "    \"Roy Lichtenstein\": \"roy-lichtenstein_subset1\",\n",
    "    \"Henri Matisse\": \"henri-matisse_subset1\",\n",
    "    \"Joan Miro\": \"joan-miro_subset2\",\n",
    "}\n",
    "\n",
    "def demo_inference_gen(adapter_choice:str, prompt:str, samples:int=1,seed:int=0, steps=50, guidance_scale=7.5):\n",
    "    adapter_path = lora_map[adapter_choice]\n",
    "    if adapter_path not in [None, \"None\"]:\n",
    "        adapter_path = f\"data/Art_adapters/{adapter_path}/adapter_alpha1.0_rank1_all_up_1000steps.pt\"\n",
    "\n",
    "    prompts = [prompt]*samples\n",
    "    infer_loader = get_validation_dataloader(prompts)\n",
    "    network = get_lora_network(pipe.unet, adapter_path)[\"network\"]\n",
    "    pred_images = inference(network, pipe.tokenizer, pipe.text_encoder, pipe.vae, pipe.unet, pipe.scheduler, infer_loader,\n",
    "                            height=512, width=512, scales=[1.0],\n",
    "                            save_dir=None, seed=seed,steps=steps, guidance_scale=guidance_scale,\n",
    "                            start_noise=-1, show=False, style_prompt=\"sks art\", no_load=True,\n",
    "                            from_scratch=True)[0][1.0]\n",
    "    return pred_images\n",
    "\n",
    "def demo_inference_stylization(adapter_path:str, prompts:list, image:list, start_noise=800,seed:int=0):\n",
    "    infer_loader = get_validation_dataloader(prompts, image)\n",
    "    network = get_lora_network(pipe.unet, adapter_path,\"all_up\")[\"network\"]\n",
    "    pred_images = inference(network, pipe.tokenizer, pipe.text_encoder, pipe.vae, pipe.unet, pipe.scheduler, infer_loader,\n",
    "                            height=512, width=512, scales=[0.,1.],\n",
    "                            save_dir=None, seed=seed,steps=20, guidance_scale=7.5,\n",
    "                            start_noise=start_noise, show=True, style_prompt=\"sks art\", no_load=True,\n",
    "                            from_scratch=False)\n",
    "    return pred_images\n",
    "\n",
    "# def infer(prompt, samples, steps, scale, seed):\n",
    "#     generator = torch.Generator(device=device).manual_seed(seed)\n",
    "#     images_list = pipe( # type: ignore\n",
    "#         [prompt] * samples,\n",
    "#         num_inference_steps=steps,\n",
    "#         guidance_scale=scale,\n",
    "#         generator=generator,\n",
    "#     )\n",
    "#     images = []\n",
    "#     safe_image = Image.open(r\"data/unsafe.png\")\n",
    "#     print(images_list)\n",
    "#     for i, image in enumerate(images_list[\"images\"]): # type: ignore\n",
    "#         if images_list[\"nsfw_content_detected\"][i]: # type: ignore\n",
    "#             images.append(safe_image)\n",
    "#         else:\n",
    "#             images.append(image)\n",
    "#     return images\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "aa33e9d104023847",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-09T12:09:39.339583Z",
     "start_time": "2024-12-09T12:09:38.953936Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<gradio.components.slider.Slider object at 0x7fa12d3a5280>\n",
      "Running on local URL: http://127.0.0.1:7876\n",
      "Running on public URL: https://be7cce8fec75395c82.gradio.live\n",
      "\n",
      "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div><iframe src=\"https://be7cce8fec75395c82.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": []
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train method: None\n",
      "Rank: 1, Alpha: 1\n",
      "create LoRA for U-Net: 0 modules.\n",
      "save dir: None\n",
      "['Park with cherry blossom trees, picnicker’s and a clear blue pond in the style of sks art'], seed=949192390\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/data/vision/torralba/selfmanaged/torralba/scratch/jomat/sam_dataset/miniforge3/envs/diffusion/lib/python3.9/site-packages/torch/nn/modules/conv.py:456: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /opt/conda/conda-bld/pytorch_1712608883701/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)\n",
      "  return F.conv2d(input, weight, bias, self.stride,\n",
      "\n",
      "00%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:03<00:00, 6.90it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Time taken for one batch, Art Adapter scale=1.0: 3.2747044563293457\n"
     ]
    }
   ],
   "source": [
    "block = gr.Blocks()\n",
    "# Direct infer\n",
    "with block:\n",
    "    with gr.Group():\n",
    "        with gr.Row():\n",
    "            text = gr.Textbox(\n",
    "                label=\"Enter your prompt\",\n",
    "                max_lines=2,\n",
    "                placeholder=\"Enter your prompt\",\n",
    "                container=False,\n",
    "                value=\"Park with cherry blossom trees, picnicker’s and a clear blue pond.\",\n",
    "            )\n",
    "\n",
    "\n",
    "\n",
    "            btn = gr.Button(\"Run\", scale=0)\n",
    "        gallery = gr.Gallery(\n",
    "            label=\"Generated images\",\n",
    "            show_label=False,\n",
    "            elem_id=\"gallery\",\n",
    "            columns=[2],\n",
    "        )\n",
    "\n",
    "    advanced_button = gr.Button(\"Advanced options\", elem_id=\"advanced-btn\")\n",
    "\n",
    "    with gr.Row(elem_id=\"advanced-options\"):\n",
    "        adapter_choice = gr.Dropdown(\n",
    "            label=\"Choose adapter\",\n",
    "            choices=[\"None\", \"Andre Derain\",\"Vincent van Gogh\",\"Andy Warhol\", \"Walter Battiss\",\n",
    "                     \"Camille Corot\", \"Claude Monet\", \"Pablo Picasso\",\n",
    "                     \"Jackson Pollock\", \"Gerhard Richter\", \"M.C. Escher\",\n",
    "                     \"Albert Gleizes\", \"Hokusai\", \"Wassily Kandinsky\", \"Gustav Klimt\", \"Roy Lichtenstein\",\n",
    "                     \"Henri Matisse\", \"Joan Miro\"\n",
    "                     ],\n",
    "            value=\"None\"\n",
    "        )\n",
    "        # print(adapter_choice[0])\n",
    "        # lora_path = lora_map[adapter_choice.value]\n",
    "        # if lora_path is not None:\n",
    "        #     lora_path = f\"data/Art_adapters/{lora_path}/adapter_alpha1.0_rank1_all_up_1000steps.pt\"\n",
    "\n",
    "        samples = gr.Slider(label=\"Images\", minimum=1, maximum=4, value=1, step=1)\n",
    "        steps = gr.Slider(label=\"Steps\", minimum=1, maximum=50, value=20, step=1)\n",
    "        scale = gr.Slider(\n",
    "            label=\"Guidance Scale\", minimum=0, maximum=50, value=7.5, step=0.1\n",
    "        )\n",
    "        print(scale)\n",
    "        seed = gr.Slider(\n",
    "            label=\"Seed\",\n",
    "            minimum=0,\n",
    "            maximum=2147483647,\n",
    "            step=1,\n",
    "            randomize=True,\n",
    "        )\n",
    "\n",
    "    gr.on([text.submit, btn.click], demo_inference_gen, inputs=[adapter_choice, text, samples, seed, steps, scale], outputs=gallery)\n",
    "    advanced_button.click(\n",
    "        None,\n",
    "        [],\n",
    "        text,\n",
    "    )\n",
    "\n",
    "\n",
    "block.launch(share=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3239c12167a5f2cd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
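The notebook above wires `demo_inference_gen` into a Gradio UI. For completeness, here is a minimal sketch of calling the same helper directly, assuming the notebook's earlier cells have already run (so `pipe`, `lora_map`, and `demo_inference_gen` exist in the session) and the adapter checkpoints under `data/Art_adapters/` are present; the output filenames are illustrative, and the loop assumes the returned items are PIL images, as the Gallery usage suggests.

```python
# Minimal sketch, assuming the notebook cells above have already executed
# (pipeline loaded, inference helpers imported, demo_inference_gen defined).
prompt = "Park with cherry blossom trees, picnickers and a clear blue pond."

# Two samples with the Van Gogh adapter, 20 denoising steps.
images = demo_inference_gen(
    adapter_choice="Vincent van Gogh",
    prompt=prompt,
    samples=2,
    seed=0,
    steps=20,
    guidance_scale=7.5,
)

# Assumed to be PIL images (as the Gradio gallery implies); paths are illustrative.
for i, img in enumerate(images):
    img.save(f"van_gogh_sample_{i}.png")
```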
README.md
CHANGED
@@ -1,13 +1,6 @@
 ---
-title: Art
-
-colorFrom: purple
-colorTo: red
+title: Art-Free-Diffusion
+app_file: hf_demo.py
 sdk: gradio
-sdk_version:
-app_file: app.py
-pinned: false
-short_description: Demo for Art Free Diffusion
+sdk_version: 4.44.1
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/inference.cpython-39.pyc
ADDED
Binary file (19.8 kB).
custom_datasets/__init__.py
ADDED
@@ -0,0 +1,141 @@
from .mypath import MyPath
from copy import deepcopy
from datasets import load_dataset
from torch.utils.data import Dataset
import numpy as np

def get_dataset(dataset_name, transformation=None , train_subsample:int =None, val_subsample:int = 10000, get_val=True):
    if train_subsample is not None and train_subsample<val_subsample and train_subsample!=-1:
        print(f"Warning: train_subsample is smaller than val_subsample. val_subsample will be set to train_subsample: {train_subsample}")
        val_subsample = train_subsample

    if dataset_name == "imagenet":
        from .imagenet import Imagenet1k
        train_set = Imagenet1k(data_dir = MyPath.db_root_dir(dataset_name), transform = transformation, split="train", prompt_transform=Label_prompt_transform(real=True))
    elif dataset_name == "coco_train":
        # raise NotImplementedError("Use coco_filtered instead")
        from .coco import CocoCaptions
        train_set = CocoCaptions(root=MyPath.db_root_dir("coco_train"), annFile=MyPath.db_root_dir("coco_caption_train"))
    elif dataset_name == "coco_val":
        from .coco import CocoCaptions
        train_set = CocoCaptions(root=MyPath.db_root_dir("coco_val"), annFile=MyPath.db_root_dir("coco_caption_val"))
        return {"val": train_set}

    elif dataset_name == "coco_clip_filtered":
        from .coco import CocoCaptions_clip_filtered
        train_set = CocoCaptions_clip_filtered(root=MyPath.db_root_dir("coco_train"), annFile=MyPath.db_root_dir("coco_caption_train"))
    elif dataset_name == "coco_filtered_sub100":
        from .coco import CocoCaptions_clip_filtered
        train_set = CocoCaptions_clip_filtered(root=MyPath.db_root_dir("coco_train"), annFile=MyPath.db_root_dir("coco_caption_train"), id_file=MyPath.db_root_dir("coco_clip_filtered_ids_sub100"),)
    elif dataset_name == "cifar10":
        from .cifar import CIFAR10
        train_set = CIFAR10(root=MyPath.db_root_dir("cifar10"), train=True, transform=transformation, prompt_transform=Label_prompt_transform(real=True))
    elif dataset_name == "cifar100":
        from .cifar import CIFAR100
        train_set = CIFAR100(root=MyPath.db_root_dir("cifar100"), train=True, transform=transformation, prompt_transform=Label_prompt_transform(real=True))
    elif "wikiart" in dataset_name and "/" not in dataset_name:
        from .wikiart.wikiart import Wikiart_caption
        dataset = Wikiart_caption(data_path=MyPath.db_root_dir(dataset_name))
        return {"train": dataset.subsample(train_subsample).get_dataset(), "val": deepcopy(dataset).subsample(val_subsample).get_dataset() if get_val else None}
    elif "imagepair" in dataset_name:
        from .imagepair import ImagePair
        train_set = ImagePair(folder1=MyPath.db_root_dir(dataset_name)[0], folder2=MyPath.db_root_dir(dataset_name)[1], transform=transformation).subsample(train_subsample)
    # elif dataset_name == "sam_clip_filtered":
    #     from .sam import SamDataset
    #     train_set = SamDataset(image_folder_path=MyPath.db_root_dir("sam_images"), caption_folder_path=MyPath.db_root_dir("sam_captions"), id_file=MyPath.db_root_dir("sam_ids"), transforms=transformation).subsample(train_subsample)
    elif dataset_name == "sam_whole_filtered":
        from .sam import SamDataset
        train_set = SamDataset(image_folder_path=MyPath.db_root_dir("sam_images"), caption_folder_path=MyPath.db_root_dir("sam_captions"), id_file=MyPath.db_root_dir("sam_whole_filtered_ids_train"), id_dict_file=MyPath.db_root_dir("sam_id_dict"), transforms=transformation).subsample(train_subsample)
    elif dataset_name == "sam_whole_filtered_val":
        from .sam import SamDataset
        train_set = SamDataset(image_folder_path=MyPath.db_root_dir("sam_images"), caption_folder_path=MyPath.db_root_dir("sam_captions"), id_file=MyPath.db_root_dir("sam_whole_filtered_ids_val"), id_dict_file=MyPath.db_root_dir("sam_id_dict"), transforms=transformation).subsample(train_subsample)
        return {"val": train_set}
    elif dataset_name == "lhq_sub100":
        from .lhq import LhqDataset
        train_set = LhqDataset(image_folder_path=MyPath.db_root_dir("lhq_images"), caption_folder_path=MyPath.db_root_dir("lhq_captions"), id_file=MyPath.db_root_dir("lhq_ids_sub100"), transforms=transformation)
    elif dataset_name == "lhq_sub500":
        from .lhq import LhqDataset
        train_set = LhqDataset(image_folder_path=MyPath.db_root_dir("lhq_images"), caption_folder_path=MyPath.db_root_dir("lhq_captions"), id_file=MyPath.db_root_dir("lhq_ids_sub500"), transforms=transformation)
    elif dataset_name == "lhq_sub9":
        from .lhq import LhqDataset
        train_set = LhqDataset(image_folder_path=MyPath.db_root_dir("lhq_images"), caption_folder_path=MyPath.db_root_dir("lhq_captions"), id_file=MyPath.db_root_dir("lhq_ids_sub9"), transforms=transformation)

    elif dataset_name == "custom_coco100":
        from .coco import CustomCocoCaptions
        train_set = CustomCocoCaptions(root=MyPath.db_root_dir("coco_val"), annFile=MyPath.db_root_dir("coco_caption_val"),
                                       custom_file=MyPath.db_root_dir("custom_coco100_captions"), transforms=transformation)
    elif dataset_name == "custom_coco500":
        from .coco import CustomCocoCaptions
        train_set = CustomCocoCaptions(root=MyPath.db_root_dir("coco_val"), annFile=MyPath.db_root_dir("coco_caption_val"),
                                       custom_file=MyPath.db_root_dir("custom_coco500_captions"), transforms=transformation)
    elif dataset_name == "laion_pop500":
        from .custom_caption import Laion_pop
        train_set = Laion_pop(anno_file=MyPath.db_root_dir("laion_pop500"), image_root=MyPath.db_root_dir("laion_images"), transform=transformation)

    elif dataset_name == "laion_pop500_first_sentence":
        from .custom_caption import Laion_pop
        train_set = Laion_pop(anno_file=MyPath.db_root_dir("laion_pop500_first_sentence"), image_root=MyPath.db_root_dir("laion_images"), transform=transformation)


    else:
        try:
            train_set = load_dataset('imagefolder', data_dir = dataset_name, split="train")
            val_set = deepcopy(train_set)
            if val_subsample is not None and val_subsample != -1:
                val_set = val_set.shuffle(seed=0).select(range(val_subsample))
            return {"train": train_set, "val": val_set if get_val else None}
        except:
            raise ValueError(f"dataset_name {dataset_name} not found.")
    return {"train": train_set, "val": deepcopy(train_set).subsample(val_subsample) if get_val else None}


class MergeDataset(Dataset):
    @staticmethod
    def get_merged_dataset(dataset_names:list, transformation=None, train_subsample:int =None, val_subsample:int = 10000):
        train_datasets = []
        val_datasets = []
        for dataset_name in dataset_names:
            datasets = get_dataset(dataset_name, transformation, train_subsample, val_subsample)
            train_datasets.append(datasets["train"])
            val_datasets.append(datasets["val"])
        train_datasets = MergeDataset(train_datasets).subsample(train_subsample)
        val_datasets = MergeDataset(val_datasets).subsample(val_subsample)
        return {"train": train_datasets, "val": val_datasets}

    def __init__(self, datasets:list):
        self.datasets = datasets
        self.column_names = self.datasets[0].column_names
        # self.ids = []
        # start = 0
        # for dataset in self.datasets:
        #     self.ids += [i+start for i in dataset.ids]
    def define_resolution(self, resolution: int):
        for dataset in self.datasets:
            dataset.define_resolution(resolution)

    def __len__(self):
        return sum([len(dataset) for dataset in self.datasets])
    def __getitem__(self, index):
        for i,dataset in enumerate(self.datasets):
            if index < len(dataset):
                ret = dataset[index]
                ret["id"] = index
                ret["dataset"] = i
                return ret
            index -= len(dataset)
        raise IndexError

    def subsample(self, num:int):
        if num is None:
            return self
        dataset_ratio = np.array([len(dataset) for dataset in self.datasets]) / len(self)
        new_datasets = []
        for i, dataset in enumerate(self.datasets):
            new_datasets.append(dataset.subsample(int(num*dataset_ratio[i])))
        return MergeDataset(new_datasets)

    def with_transform(self, transform):
        for dataset in self.datasets:
            dataset.with_transform(transform)
        return self
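A hedged usage sketch for the loader above: it assumes the paths registered in `MyPath.db_root_dir` resolve to a local COCO 2017 validation copy, and it relies on the `subsample` and `__getitem__` behaviour defined in `custom_datasets/coco.py` further down.

```python
# Sketch only: requires MyPath.db_root_dir("coco_val") / ("coco_caption_val")
# to point at a local COCO 2017 validation set.
from custom_datasets import get_dataset

splits = get_dataset("coco_val")          # the "coco_val" branch returns {"val": CocoCaptions}
val_set = splits["val"].subsample(100)    # equal-interval subsample down to 100 images

sample = val_set[0]                       # dict with "id", "image", "caption"
print(sample["id"], sample["caption"][0][0])   # image id and its first caption string
```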
custom_datasets/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (5.8 kB).
custom_datasets/__pycache__/coco.cpython-39.pyc
ADDED
Binary file (10.4 kB).
custom_datasets/__pycache__/imagepair.cpython-39.pyc
ADDED
Binary file (8.93 kB).
custom_datasets/__pycache__/mypath.cpython-39.pyc
ADDED
Binary file (1.49 kB).
custom_datasets/coco.py
ADDED
@@ -0,0 +1,307 @@
import os.path
from typing import Any, Callable, List, Optional, Tuple

from PIL import Image

from torchvision.datasets.vision import VisionDataset
import pickle
import csv
import pandas as pd
import torch
import torchvision
import re
# from torchvision.datasets import CocoDetection
# from utils.clip_filter import Clip_filter
from tqdm import tqdm
from .mypath import MyPath

class CocoDetection(VisionDataset):
    """`MS Coco Detection <https://cocodataset.org/#detection-2016>`_ Dataset.

    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.

    Args:
        root (string): Root directory where images are downloaded to.
        annFile (string): Path to json annotation file.
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.PILToTensor``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        transforms (callable, optional): A function/transform that takes input sample and its target as entry
            and returns a transformed version.
    """

    def __init__(
        self,
        root: str ,
        annFile: str,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        transforms: Optional[Callable] = None,
        get_img=True,
        get_cap=True
    ) -> None:
        super().__init__(root, transforms, transform, target_transform)
        from pycocotools.coco import COCO

        self.coco = COCO(annFile)
        self.ids = list(sorted(self.coco.imgs.keys()))
        self.column_names = ["image", "text"]
        self.get_img = get_img
        self.get_cap = get_cap

    def _load_image(self, id: int) -> Image.Image:
        path = self.coco.loadImgs(id)[0]["file_name"]
        with open(os.path.join(self.root, path), 'rb') as f:
            img = Image.open(f).convert("RGB")

        return img

    def _load_target(self, id: int) -> List[Any]:
        return self.coco.loadAnns(self.coco.getAnnIds(id))

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        id = self.ids[index]
        ret={"id":id}
        if self.get_img:
            image = self._load_image(id)
            ret["image"] = image
        if self.get_cap:
            target = self._load_target(id)
            ret["caption"] = [target]

        if self.transforms is not None:
            ret = self.transforms(ret)

        return ret

    def subsample(self, n: int = 10000):
        if n is None or n == -1:
            return self
        ori_len = len(self)
        assert n <= ori_len
        # equal interval subsample
        ids = self.ids[::ori_len // n][:n]
        self.ids = ids
        print(f"COCO dataset subsampled from {ori_len} to {len(self)}")
        return self


    def with_transform(self, transform):
        self.transforms = transform
        return self

    def __len__(self) -> int:
        # return 100
        return len(self.ids)


class CocoCaptions(CocoDetection):
    """`MS Coco Captions <https://cocodataset.org/#captions-2015>`_ Dataset.

    It requires the `COCO API to be installed <https://github.com/pdollar/coco/tree/master/PythonAPI>`_.

    Args:
        root (string): Root directory where images are downloaded to.
        annFile (string): Path to json annotation file.
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.PILToTensor``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        transforms (callable, optional): A function/transform that takes input sample and its target as entry
            and returns a transformed version.

    Example:

        .. code:: python

            import torchvision.datasets as dset
            import torchvision.transforms as transforms
            cap = dset.CocoCaptions(root = 'dir where images are',
                                    annFile = 'json annotation file',
                                    transform=transforms.PILToTensor())

            print('Number of samples: ', len(cap))
            img, target = cap[3]  # load 4th sample

            print("Image Size: ", img.size())
            print(target)

        Output: ::

            Number of samples: 82783
            Image Size: (3L, 427L, 640L)
            [u'A plane emitting smoke stream flying over a mountain.',
            u'A plane darts across a bright blue sky behind a mountain covered in snow',
            u'A plane leaves a contrail above the snowy mountain top.',
            u'A mountain that has a plane flying overheard in the distance.',
            u'A mountain view with a plume of smoke in the background']

    """

    def _load_target(self, id: int) -> List[str]:
        return [ann["caption"] for ann in super()._load_target(id)]


class CocoCaptions_clip_filtered(CocoCaptions):
    positive_prompt=["painting", "drawing", "graffiti",]
    def __init__(
        self,
        root: str ,
        annFile: str,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        transforms: Optional[Callable] = None,
        regenerate: bool = False,
        id_file: Optional[str] = "/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/data/coco/coco_clip_filtered_ids.pickle"
    ) -> None:
        super().__init__(root, annFile, transform, target_transform, transforms)
        os.makedirs(os.path.dirname(id_file), exist_ok=True)
        if os.path.exists(id_file) and not regenerate:
            with open(id_file, "rb") as f:
                self.ids = pickle.load(f)
        else:
            self.ids, naive_filtered_num = self.naive_filter()
            self.ids, clip_filtered_num = self.clip_filter(0.7)

            print(f"naive Filtered {naive_filtered_num} images")
            print(f"Clip Filtered {clip_filtered_num} images")

            with open(id_file, "wb") as f:
                pickle.dump(self.ids, f)
            print(f"Filtered ids saved to {id_file}")
        print(f"COCO filtered dataset size: {len(self)}")

    def naive_filter(self, filter_prompt="painting"):
        new_ids = []
        naive_filtered_num = 0
        for id in self.ids:
            target = self._load_target(id)
            filtered = False
            for prompt in target:
                if filter_prompt in prompt.lower():
                    filtered = True
                    naive_filtered_num += 1
                    break
                # if "artwork" in prompt.lower():
                #     pass
            if not filtered:
                new_ids.append(id)
        return new_ids, naive_filtered_num

    # def clip_filter(self, threshold=0.7):
    #
    #     def collate_fn(examples):
    #         # {"image": image, "text": [target], "id":id}
    #         pixel_values = [example["image"] for example in examples]
    #         prompts = [example["text"] for example in examples]
    #         id = [example["id"] for example in examples]
    #         return {"images": pixel_values, "prompts": prompts, "ids": id}
    #
    #
    #     clip_filtered_num = 0
    #     clip_filter = Clip_filter(positive_prompt=self.positive_prompt)
    #     clip_logs={"positive_prompt":clip_filter.positive_prompt, "negative_prompt":clip_filter.negative_prompt,
    #                "ids":torch.Tensor([]),"logits":torch.Tensor([])}
    #     clip_log_file = "data/coco/clip_logs.pth"
    #     new_ids = []
    #     batch_size = 128
    #     dataloader = torch.utils.data.DataLoader(self, batch_size=batch_size, num_workers=10, shuffle=False,
    #                                              collate_fn=collate_fn)
    #     for i, batch in enumerate(tqdm(dataloader)):
    #         images = batch["images"]
    #         filter_result, logits = clip_filter.filter(images, threshold=threshold)
    #         ids = torch.IntTensor(batch["ids"])
    #         clip_logs["ids"] = torch.cat([clip_logs["ids"], ids])
    #         clip_logs["logits"] = torch.cat([clip_logs["logits"], logits])
    #
    #         new_ids.extend(ids[~filter_result].tolist())
    #         clip_filtered_num += filter_result.sum().item()
    #         if i % 50 == 0:
    #             torch.save(clip_logs, clip_log_file)
    #     torch.save(clip_logs, clip_log_file)
    #
    #     return new_ids, clip_filtered_num


class CustomCocoCaptions(CocoCaptions):
    def __init__(self, root: str=MyPath.db_root_dir("coco_val"), annFile: str=MyPath.db_root_dir("coco_caption_val"), custom_file:str="/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/jomat-code/filtering/ms_coco_captions_testset100.txt",transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, transforms: Optional[Callable] = None) -> None:

        super().__init__(root, annFile, transform, target_transform, transforms)
        self.column_names = ["image", "text"]
        self.custom_file = custom_file
        self.load_custom_data(custom_file)
        self.transforms = transforms

    def load_custom_data(self, custom_file):
        self.custom_data = []
        with open(custom_file, "r") as f:
            data = f.readlines()
            head = data[0].strip().split(",")
            self.head = head
            for line in data[1:]:
                sub_data = line.strip().split(",")
                if len(sub_data) > len(head):
                    sub_data_new = [sub_data[0]]
                    sub_data_new+=[",".join(sub_data[1:-1])]
                    sub_data_new.append(sub_data[-1])
                    sub_data = sub_data_new
                assert len(sub_data) == len(head)
                self.custom_data.append(sub_data)
        # to pd
        self.custom_data = pd.DataFrame(self.custom_data, columns=head)

    def __len__(self) -> int:
        return len(self.custom_data)

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        data = self.custom_data.iloc[index]
        id = int(data["image_id"])
        ret={"id":id}
        if self.get_img:
            image = self._load_image(id)
            ret["image"] = image
        if self.get_cap:
            caption = data["caption"]
            ret["caption"] = [caption]
        ret["seed"] = int(data["random_seed"])

        if self.transforms is not None:
            ret = self.transforms(ret)

        return ret



def get_validation_set():
    coco_instance = CocoDetection(root="/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/.datasets/coco_2017/train2017/", annFile="/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/.datasets/coco_2017/annotations/instances_train2017.json")
    discard_cat_id = coco_instance.coco.getCatIds(supNms=["person", "animal"])
    discard_img_id = []
    for cat_id in discard_cat_id:
        discard_img_id += coco_instance.coco.catToImgs[cat_id]

    coco_clip_filtered = CocoCaptions_clip_filtered(root=MyPath.db_root_dir("coco_train"), annFile=MyPath.db_root_dir("coco_caption_train"),
                                                    regenerate=False)
    coco_clip_filtered_ids = coco_clip_filtered.ids
    new_ids = set(coco_clip_filtered_ids) - set(discard_img_id)
    new_ids = list(new_ids)
    new_ids = random.sample(new_ids, 100)
    with open("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/data/coco/coco_clip_filtered_subset100.pickle", "wb") as f:
        pickle.dump(new_ids, f)

if __name__ == "__main__":
    from mypath import MyPath
    import random
    # get_validation_set()
    # coco_filtered_remian_id = pickle.load(open("data/coco/coco_clip_filtered_ids.pickle", "rb"))
    #
    # coco_filtered_subset100 = random.sample(coco_filtered_remian_id, 100)
    # save_path = "data/coco/coco_clip_filtered_subset100.pickle"
    # with open(save_path, "wb") as f:
    #     pickle.dump(coco_filtered_subset100, f)

    # dataset = CocoCaptions_clip_filtered(root=MyPath.db_root_dir("coco_train"), annFile=MyPath.db_root_dir("coco_caption_train"),
    #                                      regenerate=False)
    dataset = CustomCocoCaptions(root=MyPath.db_root_dir("coco_val"), annFile=MyPath.db_root_dir("coco_caption_val"),
                                 custom_file="/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/jomat-code/filtering/ms_coco_captions_testset100.txt")
    dataset[0]
custom_datasets/custom_caption.py
ADDED
@@ -0,0 +1,113 @@
# Authors: Hui Ren (rhfeiyang.github.io)
import torch
import pandas as pd
import numpy as np
import os
from PIL import Image

class Caption_set(torch.utils.data.Dataset):

    style_set_names=[
        "andre-derain_subset1",
        "andy_subset1",
        "camille-corot_subset1",
        "gerhard-richter_subset1",
        "henri-matisse_subset1",
        "katsushika-hokusai_subset1",
        "klimt_subset3",
        "monet_subset2",
        "picasso_subset1",
        "van_gogh_subset1",
    ]
    style_set_map={f"{name}":f"/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/clip_dissection/Style_captions/{name}/style_captions.csv" for name in style_set_names}

    def __init__(self, prompts_path=None, set_name=None, transform=None):
        assert prompts_path is not None or set_name is not None, "Either prompts_path or set_name should be provided"
        if prompts_path is None:
            prompts_path = self.style_set_map[set_name]

        self.prompts = pd.read_csv(prompts_path, delimiter=';')
        self.transform = transform
    def __len__(self):
        return len(self.prompts)
    def __getitem__(self, idx):
        ret={}
        ret["id"] = idx
        info = self.prompts.iloc[idx]
        ret.update(info)
        for k,v in ret.items():
            if isinstance(v,np.int64):
                ret[k] = int(v)
        ret["caption"] = [ret["caption"]]
        if self.transform:
            ret = self.transform(ret)
        return ret

    def with_transform(self, transform):
        self.transform = transform
        return self


class HRS_caption(Caption_set):
    def __init__(self, prompts_path="/vision-nfs/torralba/projects/jomat/hui/stable_diffusion/clip_dissection/Style_captions/andre-derain_subset1/style_captions.csv", transform=None, delimiter=','):
        self.prompts = pd.read_csv(prompts_path, delimiter=delimiter)
        self.transform = transform
        self.caption_key = "original_prompts"

    def __getitem__(self, idx):
        ret={}
        ret["id"] = idx
        info = self.prompts.iloc[idx]
        ret["caption"] = [info[self.caption_key]]
        ret["seed"] = idx
        if self.transform:
            ret = self.transform(ret)
        return ret

class Laion_pop(torch.utils.data.Dataset):
    def __init__(self, anno_file="/vision-nfs/torralba/projects/jomat/hui/stable_diffusion/custom_datasets/laion_pop500.csv",image_root="/vision-nfs/torralba/scratch/jomat/sam_dataset/laion_pop",transform=None):
        self.transform = transform
        self.info = pd.read_csv(anno_file, delimiter=";")
        self.caption_key = "caption"
        self.image_root = image_root
        self.get_img=True
        self.get_caption=True
    def __len__(self):
        return len(self.info)

    # def subsample(self, num:int):
    #     self.data = self.data.select(range(num))
    #     return self

    def load_image(self, key):
        image_path = os.path.join(self.image_root, f"{key:09}.jpg")
        with open(image_path, "rb") as f:
            image = Image.open(f).convert("RGB")
        return image

    def __getitem__(self, idx):
        info = self.info.iloc[idx]
        ret = {}
        key = info["key"]
        ret["id"] = key
        if self.get_caption:
            ret["caption"] = [info[self.caption_key]]
            ret["seed"] = int(key)
        if self.get_img:
            ret["image"] = self.load_image(key)

        if self.transform:
            ret = self.transform(ret)
        return ret

    def with_transform(self, transform):
        self.transform = transform
        return self

    def subset(self, ids:list):
        self.info = self.info[self.info["key"].isin(ids)]
        return self

if __name__ == "__main__":
    dataset = Caption_set("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/clip_dissection/Style_captions/andre-derain_subset1/style_captions.csv")
    dataset[0]
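A small sketch of iterating `Laion_pop` captions without decoding images; it assumes the default `anno_file`/`image_root` paths hard-coded above exist locally, and it uses the `get_img` flag the class defines to skip image loading.

```python
# Sketch only: the default anno_file / image_root in Laion_pop must exist locally.
import torch
from custom_datasets.custom_caption import Laion_pop

ds = Laion_pop()       # defaults from the class definition above
ds.get_img = False     # captions only; skip opening the JPEGs
loader = torch.utils.data.DataLoader(ds, batch_size=4, collate_fn=lambda batch: batch)

batch = next(iter(loader))
print([item["caption"][0] for item in batch])   # each item also carries "id" and "seed"
```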
custom_datasets/filt/coco/filt.py
ADDED
@@ -0,0 +1,186 @@
# Authors: Hui Ren (rhfeiyang.github.io)
import os
import sys
import numpy as np
from PIL import Image
import pickle
sys.path.append(os.path.join(os.path.dirname(__file__), "../../../"))
from custom_datasets import get_dataset
from utils.art_filter import Art_filter
import torch
from matplotlib import pyplot as plt
import math
import argparse
import socket
import time
from tqdm import tqdm
import torch
def parse_args():
    parser = argparse.ArgumentParser(description="Filter the coco dataset")
    parser.add_argument("--check", action="store_true", help="Check the complete")
    parser.add_argument("--mode", default="clip_logit", help="Filter mode: clip_logit, clip_filt, caption_filt")
    parser.add_argument("--split" , default="val", help="Dataset split, val/train")
    # parser.add_argument("--start_idx", default=0, type=int, help="Start index")
    args = parser.parse_args()
    return args

def get_feat(save_path, dataloader, filter):
    clip_feat_file = save_path
    # compute_new = False
    clip_feat={}
    if os.path.exists(clip_feat_file):
        with open(clip_feat_file, 'rb') as f:
            clip_feat = pickle.load(f)
    else:
        print(f"computing clip feat",flush=True)
        clip_feature_ret = filter.clip_feature(dataloader)
        clip_feat["image_features"] = clip_feature_ret["clip_features"]
        clip_feat["ids"] = clip_feature_ret["ids"]

        with open(clip_feat_file, 'wb') as f:
            pickle.dump(clip_feat, f)
        print(f"clip_feat_result saved to {clip_feat_file}",flush=True)
    return clip_feat

def get_clip_logit(save_root, dataloader, filter):
    feat_path = os.path.join(save_root, "clip_feat.pickle")
    clip_feat = get_feat(feat_path, dataloader, filter)
    clip_logits_file = os.path.join(save_root, "clip_logits.pickle")
    # if clip_logit:
    if os.path.exists(clip_logits_file):
        with open(clip_logits_file, 'rb') as f:
            clip_logits = pickle.load(f)
    else:
        clip_logits = filter.clip_logit_by_feat(clip_feat["image_features"])
        clip_logits["ids"] = clip_feat["ids"]
        with open(clip_logits_file, 'wb') as f:
            pickle.dump(clip_logits, f)
        print(f"clip_logits_result saved to {clip_logits_file}",flush=True)
    return clip_logits

def clip_filt(save_root, dataloader, filter):
    clip_filt_file = os.path.join(save_root, "clip_filt_result.pickle")
    if os.path.exists(clip_filt_file):
        with open(clip_filt_file, 'rb') as f:
            clip_filt_result = pickle.load(f)
    else:
        clip_logits = get_clip_logit(save_root, dataloader, filter)
        clip_filt_result = filter.clip_filt(clip_logits)
        with open(clip_filt_file, 'wb') as f:
            pickle.dump(clip_filt_result, f)
        print(f"clip_filt_result saved to {clip_filt_file}",flush=True)
    return clip_filt_result

def caption_filt(save_root, dataloader, filter):
    caption_filt_file = os.path.join(save_root, "caption_filt_result.pickle")
    if os.path.exists(caption_filt_file):
        with open(caption_filt_file, 'rb') as f:
            caption_filt_result = pickle.load(f)
    else:
        caption_filt_result = filter.caption_filt(dataloader)
        with open(caption_filt_file, 'wb') as f:
            pickle.dump(caption_filt_result, f)
        print(f"caption_filt_result saved to {caption_filt_file}",flush=True)
    return caption_filt_result

def gather_result(save_dir, dataloader, filter):
    all_remain_ids=[]
    all_remain_ids_train=[]
    all_remain_ids_val=[]
    all_filtered_id_num = 0

    clip_filt_result = clip_filt(save_dir, dataloader, filter)
    caption_filt_result = caption_filt(save_dir, dataloader, filter)

    caption_filtered_ids = [i[0] for i in caption_filt_result["filtered_ids"]]
    all_filtered_id_num += len(set(clip_filt_result["filtered_ids"]) | set(caption_filtered_ids) )
    remain_ids = set(clip_filt_result["remain_ids"]) & set(caption_filt_result["remain_ids"])
    remain_ids = list(remain_ids)
    remain_ids.sort()
    with open(os.path.join(save_dir, "remain_ids.pickle"), 'wb') as f:
        pickle.dump(remain_ids, f)
    print(f"remain_ids saved to {save_dir}/remain_ids.pickle",flush=True)
    return remain_ids

@torch.no_grad()
def main(args):
    filter = Art_filter()
    if args.mode == "caption_filt" or args.mode == "gather_result":
        filter.clip_filter = None
        torch.cuda.empty_cache()

    # caption_folder_path = "/vision-nfs/torralba/scratch/jomat/sam_dataset/PixArt-alpha/captions"
    # image_folder_path = "/vision-nfs/torralba/scratch/jomat/sam_dataset/images"
    # id_dict_dir = "/vision-nfs/torralba/scratch/jomat/sam_dataset/images/id_dict"
    # filt_dir = "/vision-nfs/torralba/scratch/jomat/sam_dataset/filt_result"

    def collate_fn(examples):
        # {"image": image, "id":id}
        ret = {}
        if "image" in examples[0]:
            pixel_values = [example["image"] for example in examples]
            ret["images"] = pixel_values
        if "caption" in examples[0]:
            # prompts = [example["caption"] for example in examples]
            prompts = []
            for example in examples:
                if isinstance(example["caption"][0], list):
                    prompts.append([" ".join(example["caption"][0])])
                else:
                    prompts.append(example["caption"])
            ret["text"] = prompts
        id = [example["id"] for example in examples]
        ret["ids"] = id
        return ret
    if args.split == "val":
        dataset = get_dataset("coco_val")["val"]
    elif args.split == "train":
        dataset = get_dataset("coco_train", get_val=False)["train"]
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=False, num_workers=8, collate_fn=collate_fn)

    error_files=[]



    save_root = f"/vision-nfs/torralba/scratch/jomat/sam_dataset/coco/filt/{args.split}"
    os.makedirs(save_root, exist_ok=True)

    if args.mode == "clip_feat":
        feat_path = os.path.join(save_root, "clip_feat.pickle")
        clip_feat = get_feat(feat_path, dataloader, filter)

    if args.mode == "clip_logit":
        clip_logit = get_clip_logit(save_root, dataloader, filter)

    if args.mode == "clip_filt":
        # if os.path.exists(clip_filt_file):
        #     with open(clip_filt_file, 'rb') as f:
        #         ret = pickle.load(f)
        # else:
        clip_filt_result = clip_filt(save_root, dataloader, filter)

    if args.mode == "caption_filt":
        caption_filt_result = caption_filt(save_root, dataloader, filter)

    if args.mode == "gather_result":
        filtered_result = gather_result(save_root, dataloader, filter)

    print("finished",flush=True)
    for file in error_files:
        # os.remove(file)
        print(file,flush=True)

if __name__ == "__main__":
    args = parse_args()

    log_file = "sam_filt"
    idx=0
    hostname = socket.gethostname()
    now_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    while os.path.exists(f"{log_file}_{hostname}_check{args.check}_{now_time}_{idx}.log"):
        idx+=1

    main(args)
    # clip_logits_analysis()

custom_datasets/filt/sam_filt.py
ADDED
@@ -0,0 +1,299 @@
1 |
+
# Authors: Hui Ren (rhfeiyang.github.io)
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
import numpy as np
|
5 |
+
from PIL import Image
|
6 |
+
import pickle
|
7 |
+
sys.path.append(os.path.join(os.path.dirname(__file__), "../../"))
|
8 |
+
from custom_datasets.sam import SamDataset
|
9 |
+
from utils.art_filter import Art_filter
|
10 |
+
import torch
|
11 |
+
from matplotlib import pyplot as plt
|
12 |
+
import math
|
13 |
+
import argparse
|
14 |
+
import socket
|
15 |
+
import time
|
16 |
+
from tqdm import tqdm
|
17 |
+
|
18 |
+
def parse_args():
|
19 |
+
parser = argparse.ArgumentParser(description="Filter the sam dataset")
|
20 |
+
parser.add_argument("--check", action="store_true", help="Check the complete")
|
21 |
+
parser.add_argument("--mode", default="clip_logit", choices=["clip_logit_update","clip_logit", "clip_filt", "caption_filt", "gather_result","caption_flit_append"])
|
22 |
+
parser.add_argument("--start_idx", default=0, type=int, help="Start index")
|
23 |
+
parser.add_argument("--end_idx", default=9e10, type=int, help="Start index")
|
24 |
+
args = parser.parse_args()
|
25 |
+
return args
|
26 |
+
@torch.no_grad()
|
27 |
+
def main(args):
|
28 |
+
filter = Art_filter()
|
29 |
+
if args.mode == "caption_filt" or args.mode == "gather_result":
|
30 |
+
filter.clip_filter = None
|
31 |
+
torch.cuda.empty_cache()
|
32 |
+
|
33 |
+
caption_folder_path = "/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/clip_dissection/SAM/subset/captions"
|
34 |
+
image_folder_path = "/vision-nfs/torralba/scratch/jomat/sam_dataset/nfs-data/sam/images"
|
35 |
+
id_dict_dir = "/vision-nfs/torralba/scratch/jomat/sam_dataset/sam_ids/8.16/id_dict"
|
36 |
+
filt_dir = "/vision-nfs/torralba/scratch/jomat/sam_dataset/filt_result"
|
37 |
+
def collate_fn(examples):
|
38 |
+
# {"image": image, "id":id}
|
39 |
+
ret = {}
|
40 |
+
if "image" in examples[0]:
|
41 |
+
pixel_values = [example["image"] for example in examples]
|
42 |
+
ret["images"] = pixel_values
|
43 |
+
if "text" in examples[0]:
|
44 |
+
prompts = [example["text"] for example in examples]
|
45 |
+
ret["text"] = prompts
|
46 |
+
id = [example["id"] for example in examples]
|
47 |
+
ret["ids"] = id
|
48 |
+
return ret
|
49 |
+
error_files=[]
|
50 |
+
val_set = ["sa_000000"]
|
51 |
+
result_check_set = ["sa_000020"]
|
52 |
+
all_remain_ids=[]
|
53 |
+
all_remain_ids_train=[]
|
54 |
+
all_remain_ids_val=[]
|
55 |
+
all_filtered_id_num = 0
|
56 |
+
remain_feat_num = 0
|
57 |
+
remain_caption_num = 0
|
58 |
+
filter_feat_num = 0
|
59 |
+
filter_caption_num = 0
|
60 |
+
for idx,file in tqdm(enumerate(sorted(os.listdir(id_dict_dir)))):
|
61 |
+
if idx < args.start_idx or idx >= args.end_idx:
|
62 |
+
continue
|
63 |
+
if file.endswith(".pickle") and not file.startswith("all"):
|
64 |
+
print("=====================================")
|
65 |
+
print(file,flush=True)
|
66 |
+
save_dir = os.path.join(filt_dir, file.replace("_id_dict.pickle", ""))
|
67 |
+
if not os.path.exists(save_dir):
|
68 |
+
os.makedirs(save_dir, exist_ok=True)
|
69 |
+
id_dict_file = os.path.join(id_dict_dir, file)
|
70 |
+
with open(id_dict_file, 'rb') as f:
|
71 |
+
id_dict = pickle.load(f)
|
72 |
+
ids = list(id_dict.keys())
|
73 |
+
dataset = SamDataset(image_folder_path, caption_folder_path, id_file=ids, id_dict_file=id_dict_file)
|
74 |
+
# dataset = SamDataset(image_folder_path, caption_folder_path, id_file=[10061410, 10076945, 10310013,1042012, 4487809, 4541052], id_dict_file="/vision-nfs/torralba/scratch/jomat/sam_dataset/images/id_dict/all_id_dict.pickle")
|
75 |
+
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=False, num_workers=8, collate_fn=collate_fn)
|
76 |
+
clip_logits = None
|
77 |
+
clip_logits_file = os.path.join(save_dir, "clip_logits_result.pickle")
|
78 |
+
clip_filt_file = os.path.join(save_dir, "clip_filt_result.pickle")
|
79 |
+
caption_filt_file = os.path.join(save_dir, "caption_filt_result.pickle")
|
80 |
+
|
81 |
+
if args.mode == "clip_feat":
|
82 |
+
compute_new = False
|
83 |
+
clip_logits = {}
|
84 |
+
if os.path.exists(clip_logits_file):
|
85 |
+
with open(clip_logits_file, 'rb') as f:
|
86 |
+
clip_logits = pickle.load(f)
|
87 |
+
if "image_features" not in clip_logits:
|
88 |
+
compute_new = True
|
89 |
+
else:
|
90 |
+
compute_new=True
|
91 |
+
if compute_new:
|
92 |
+
if clip_logits == '':
|
93 |
+
clip_logits = {}
|
94 |
+
print(f"compute clip_feat {file}",flush=True)
|
95 |
+
clip_feature_ret = filter.clip_feature(dataloader)
|
96 |
+
clip_logits["image_features"] = clip_feature_ret["clip_features"]
|
97 |
+
if "ids" in clip_logits:
|
98 |
+
assert clip_feature_ret["ids"] == clip_logits["ids"]
|
99 |
+
else:
|
100 |
+
clip_logits["ids"] = clip_feature_ret["ids"]
|
101 |
+
|
102 |
+
with open(clip_logits_file, 'wb') as f:
|
103 |
+
pickle.dump(clip_logits, f)
|
104 |
+
print(f"clip_feat_result saved to {clip_logits_file}",flush=True)
|
105 |
+
else:
|
106 |
+
print(f"skip {clip_logits_file}",flush=True)
|
107 |
+
|
108 |
+
if args.mode == "clip_logit":
|
109 |
+
# if clip_logit:
|
110 |
+
if os.path.exists(clip_logits_file):
|
111 |
+
try:
|
112 |
+
with open(clip_logits_file, 'rb') as f:
|
113 |
+
clip_logits = pickle.load(f)
|
114 |
+
except:
|
115 |
+
continue
|
116 |
+
skip = True
|
117 |
+
if args.check and clip_logits=="":
|
118 |
+
skip = False
|
119 |
+
|
120 |
+
else:
|
121 |
+
skip = False
|
122 |
+
# skip = False
|
123 |
+
if not skip:
|
124 |
+
# os.makedirs(os.path.join(save_dir, "tmp"), exist_ok=True)
|
125 |
+
with open(clip_logits_file, 'wb') as f:
|
126 |
+
pickle.dump("", f)
|
127 |
+
try:
|
128 |
+
clip_logits = filter.clip_logit(dataloader)
|
129 |
+
except:
|
130 |
+
print(f"Error in clip_logit {file}",flush=True)
|
131 |
+
continue
|
132 |
+
with open(clip_logits_file, 'wb') as f:
|
133 |
+
pickle.dump(clip_logits, f)
|
134 |
+
print(f"clip_logits_result saved to {clip_logits_file}",flush=True)
|
135 |
+
else:
|
136 |
+
print(f"skip {clip_logits_file}",flush=True)
|
137 |
+
|
138 |
+
if args.mode == "clip_logit_update":
|
139 |
+
if os.path.exists(clip_logits_file):
|
140 |
+
with open(clip_logits_file, 'rb') as f:
|
141 |
+
clip_logits = pickle.load(f)
|
142 |
+
else:
|
143 |
+
print(f"{clip_logits_file} not exist",flush=True)
|
144 |
+
continue
|
145 |
+
if clip_logits == "":
|
146 |
+
print(f"skip {clip_logits_file}",flush=True)
|
147 |
+
continue
|
148 |
+
ret = filter.clip_logit_by_feat(clip_logits["clip_features"])
|
149 |
+
# assert (clip_logits["clip_logits"] - ret["clip_logits"]).abs().max() < 0.01
|
150 |
+
clip_logits["clip_logits"] = ret["clip_logits"]
|
151 |
+
clip_logits["text"] = ret["text"]
|
152 |
+
with open(clip_logits_file, 'wb') as f:
|
153 |
+
pickle.dump(clip_logits, f)
|
154 |
+
|
155 |
+
|
156 |
+
if args.mode == "clip_filt":
|
157 |
+
# if os.path.exists(clip_filt_file):
|
158 |
+
# with open(clip_filt_file, 'rb') as f:
|
159 |
+
# ret = pickle.load(f)
|
160 |
+
# else:
|
161 |
+
|
162 |
+
if clip_logits is None:
|
163 |
+
try:
|
164 |
+
with open(clip_logits_file, 'rb') as f:
|
165 |
+
clip_logits = pickle.load(f)
|
166 |
+
except:
|
167 |
+
print(f"Error in loading {clip_logits_file}",flush=True)
|
168 |
+
error_files.append(clip_logits_file)
|
169 |
+
continue
|
170 |
+
if clip_logits == "":
|
171 |
+
print(f"skip {clip_logits_file}",flush=True)
|
172 |
+
error_files.append(clip_logits_file)
|
173 |
+
continue
|
174 |
+
clip_filt_result = filter.clip_filt(clip_logits)
|
175 |
+
with open(clip_filt_file, 'wb') as f:
|
176 |
+
pickle.dump(clip_filt_result, f)
|
177 |
+
print(f"clip_filt_result saved to {clip_filt_file}",flush=True)
|
178 |
+
|
179 |
+
if args.mode == "caption_filt":
|
180 |
+
if os.path.exists(caption_filt_file):
|
181 |
+
try:
|
182 |
+
with open(caption_filt_file, 'rb') as f:
|
183 |
+
ret = pickle.load(f)
|
184 |
+
except:
|
185 |
+
continue
|
186 |
+
skip = True
|
187 |
+
if args.check and ret=="":
|
188 |
+
skip = False
|
189 |
+
# os.remove(caption_filt_file)
|
190 |
+
print(f"empty {caption_filt_file}",flush=True)
|
191 |
+
# skip = True
|
192 |
+
else:
|
193 |
+
skip = False
|
194 |
+
if not skip:
|
195 |
+
with open(caption_filt_file, 'wb') as f:
|
196 |
+
pickle.dump("", f)
|
197 |
+
# try:
|
198 |
+
ret = filter.caption_filt(dataloader)
|
199 |
+
# except:
|
200 |
+
# print(f"Error in filtering {file}",flush=True)
|
201 |
+
# continue
|
202 |
+
with open(caption_filt_file, 'wb') as f:
|
203 |
+
pickle.dump(ret, f)
|
204 |
+
print(f"caption_filt_result saved to {caption_filt_file}",flush=True)
|
205 |
+
else:
|
206 |
+
print(f"skip {caption_filt_file}",flush=True)
|
207 |
+
|
208 |
+
if args.mode == "caption_flit_append":
|
209 |
+
if not os.path.exists(caption_filt_file):
|
210 |
+
print(f"{caption_filt_file} not exist",flush=True)
|
211 |
+
continue
|
212 |
+
with open(caption_filt_file, 'rb') as f:
|
213 |
+
old_caption_filt_result = pickle.load(f)
|
214 |
+
skip = True
|
215 |
+
for i in filter.caption_filter.filter_prompts:
|
216 |
+
if i not in old_caption_filt_result["filter_prompts"]:
|
217 |
+
skip = False
|
218 |
+
break
|
219 |
+
if skip:
|
220 |
+
print(f"skip {caption_filt_file}",flush=True)
|
221 |
+
continue
|
222 |
+
old_remain_ids = old_caption_filt_result["remain_ids"]
|
223 |
+
new_dataset = SamDataset(image_folder_path, caption_folder_path, id_file=old_remain_ids, id_dict_file=id_dict_file)
|
224 |
+
new_dataloader = torch.utils.data.DataLoader(new_dataset, batch_size=64, shuffle=False, num_workers=8, collate_fn=collate_fn)
|
225 |
+
ret = filter.caption_filt(new_dataloader)
|
226 |
+
old_caption_filt_result["remain_ids"] = ret["remain_ids"]
|
227 |
+
old_caption_filt_result["filtered_ids"].extend(ret["filtered_ids"])
|
228 |
+
new_filter_count = ret["filter_count"].copy()
|
229 |
+
for i in range(len(old_caption_filt_result["filter_count"])):
|
230 |
+
new_filter_count[i] += old_caption_filt_result["filter_count"][i]
|
231 |
+
|
232 |
+
old_caption_filt_result["filter_count"] = new_filter_count
|
233 |
+
old_caption_filt_result["filter_prompts"] = ret["filter_prompts"]
|
234 |
+
with open(caption_filt_file, 'wb') as f:
|
235 |
+
pickle.dump(old_caption_filt_result, f)
|
236 |
+
|
237 |
+
|
238 |
+
|
239 |
+
if args.mode == "gather_result":
|
240 |
+
with open(clip_filt_file, 'rb') as f:
|
241 |
+
clip_filt_result = pickle.load(f)
|
242 |
+
with open(caption_filt_file, 'rb') as f:
|
243 |
+
caption_filt_result = pickle.load(f)
|
244 |
+
caption_filtered_ids = [i[0] for i in caption_filt_result["filtered_ids"]]
|
245 |
+
all_filtered_id_num += len(set(clip_filt_result["filtered_ids"]) | set(caption_filtered_ids) )
|
246 |
+
|
247 |
+
remain_feat_num += len(clip_filt_result["remain_ids"])
|
248 |
+
remain_caption_num += len(caption_filt_result["remain_ids"])
|
249 |
+
filter_feat_num += len(clip_filt_result["filtered_ids"])
|
250 |
+
filter_caption_num += len(caption_filtered_ids)
|
251 |
+
|
252 |
+
remain_ids = set(clip_filt_result["remain_ids"]) & set(caption_filt_result["remain_ids"])
|
253 |
+
remain_ids = list(remain_ids)
|
254 |
+
remain_ids.sort()
|
255 |
+
# with open(os.path.join(save_dir, "remain_ids.pickle"), 'wb') as f:
|
256 |
+
# pickle.dump(remain_ids, f)
|
257 |
+
# print(f"remain_ids saved to {save_dir}/remain_ids.pickle",flush=True)
|
258 |
+
all_remain_ids.extend(remain_ids)
|
259 |
+
if file.replace("_id_dict.pickle","") in val_set:
|
260 |
+
all_remain_ids_val.extend(remain_ids)
|
261 |
+
else:
|
262 |
+
all_remain_ids_train.extend(remain_ids)
|
263 |
+
if args.mode == "gather_result":
|
264 |
+
print(f"filtered ids: {all_filtered_id_num}",flush=True)
|
265 |
+
print(f"remain feat num: {remain_feat_num}",flush=True)
|
266 |
+
print(f"remain caption num: {remain_caption_num}",flush=True)
|
267 |
+
print(f"filter feat num: {filter_feat_num}",flush=True)
|
268 |
+
print(f"filter caption num: {filter_caption_num}",flush=True)
|
269 |
+
all_remain_ids.sort()
|
270 |
+
with open(os.path.join(filt_dir, "all_remain_ids.pickle"), 'wb') as f:
|
271 |
+
pickle.dump(all_remain_ids, f)
|
272 |
+
with open(os.path.join(filt_dir, "all_remain_ids_train.pickle"), 'wb') as f:
|
273 |
+
pickle.dump(all_remain_ids_train, f)
|
274 |
+
with open(os.path.join(filt_dir, "all_remain_ids_val.pickle"), 'wb') as f:
|
275 |
+
pickle.dump(all_remain_ids_val, f)
|
276 |
+
|
277 |
+
print(f"all_remain_ids saved to {filt_dir}/all_remain_ids.pickle",flush=True)
|
278 |
+
print(f"all_remain_ids_train saved to {filt_dir}/all_remain_ids_train.pickle",flush=True)
|
279 |
+
print(f"all_remain_ids_val saved to {filt_dir}/all_remain_ids_val.pickle",flush=True)
|
280 |
+
|
281 |
+
print("finished",flush=True)
|
282 |
+
for file in error_files:
|
283 |
+
# os.remove(file)
|
284 |
+
print(file,flush=True)
|
285 |
+
|
286 |
+
if __name__ == "__main__":
|
287 |
+
args = parse_args()
|
288 |
+
|
289 |
+
log_file = "sam_filt"
|
290 |
+
idx=0
|
291 |
+
hostname = socket.gethostname()
|
292 |
+
now_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
|
293 |
+
while os.path.exists(f"{log_file}_{hostname}_check{args.check}_{now_time}_{idx}.log"):
|
294 |
+
idx+=1
|
295 |
+
|
296 |
+
main(args)
|
297 |
+
# clip_logits_analysis()
|
298 |
+
|
299 |
+
|
custom_datasets/imagepair.py
ADDED
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Authors: Hui Ren (rhfeiyang.github.io)
|
2 |
+
import random
|
3 |
+
|
4 |
+
import torch.utils.data as data
|
5 |
+
from PIL import Image
|
6 |
+
import os
|
7 |
+
import torch
|
8 |
+
# from tqdm import tqdm
|
9 |
+
class ImageSet(data.Dataset):
|
10 |
+
def __init__(self, folder , transform=None, keep_in_mem=True, caption=None):
|
11 |
+
self.path = folder
|
12 |
+
self.transform = transform
|
13 |
+
self.caption_path = None
|
14 |
+
self.images = []
|
15 |
+
self.captions = []
|
16 |
+
self.keep_in_mem = keep_in_mem
|
17 |
+
|
18 |
+
if not isinstance(folder, list):
|
19 |
+
self.image_files = [file for file in os.listdir(folder) if file.endswith((".png",".jpg"))]
|
20 |
+
self.image_files.sort()
|
21 |
+
else:
|
22 |
+
self.images = folder
|
23 |
+
|
24 |
+
if not isinstance(caption, list):
|
25 |
+
if caption not in [None, "", "None"]:
|
26 |
+
self.caption_path = caption
|
27 |
+
self.caption_files = [os.path.join(caption, file.replace(".png", ".txt").replace(".jpg", ".txt")) for file in self.image_files]
|
28 |
+
self.caption_files.sort()
|
29 |
+
else:
|
30 |
+
self.caption_path = True
|
31 |
+
self.captions = caption
|
32 |
+
# get all the image files png/jpg
|
33 |
+
|
34 |
+
|
35 |
+
if keep_in_mem:
|
36 |
+
if len(self.images) == 0:
|
37 |
+
for file in self.image_files:
|
38 |
+
img = self.load_image(os.path.join(self.path, file))
|
39 |
+
self.images.append(img)
|
40 |
+
if len(self.captions) == 0:
|
41 |
+
if self.caption_path is not None:
|
42 |
+
self.captions = []
|
43 |
+
for file in self.caption_files:
|
44 |
+
caption = self.load_caption(file)
|
45 |
+
self.captions.append(caption)
|
46 |
+
else:
|
47 |
+
self.images = None
|
48 |
+
|
49 |
+
def limit_num(self, n):
|
50 |
+
raise NotImplementedError
|
51 |
+
assert n <= len(self), f"n should be less than the length of the dataset {len(self)}"
|
52 |
+
self.image_files = self.image_files[:n]
|
53 |
+
self.caption_files = self.caption_files[:n]
|
54 |
+
if self.keep_in_mem:
|
55 |
+
self.images = self.images[:n]
|
56 |
+
self.captions = self.captions[:n]
|
57 |
+
print(f"Dataset limited to {n}")
|
58 |
+
|
59 |
+
def __len__(self):
|
60 |
+
if len(self.images) != 0:
|
61 |
+
return len(self.images)
|
62 |
+
else:
|
63 |
+
return len(self.image_files)
|
64 |
+
|
65 |
+
def load_image(self, path):
|
66 |
+
with open(path, 'rb') as f:
|
67 |
+
img = Image.open(f).convert('RGB')
|
68 |
+
return img
|
69 |
+
|
70 |
+
def load_caption(self, path):
|
71 |
+
with open(path, 'r') as f:
|
72 |
+
caption = f.readlines()
|
73 |
+
caption = [line.strip() for line in caption if len(line.strip()) > 0]
|
74 |
+
return caption
|
75 |
+
|
76 |
+
def __getitem__(self, index):
|
77 |
+
if len(self.images) != 0:
|
78 |
+
img = self.images[index]
|
79 |
+
else:
|
80 |
+
img = self.load_image(os.path.join(self.path, self.image_files[index]))
|
81 |
+
|
82 |
+
# if self.transform is not None:
|
83 |
+
# img = self.transform(img)
|
84 |
+
|
85 |
+
if self.caption_path is not None or len(self.captions) != 0:
|
86 |
+
if len(self.captions) != 0:
|
87 |
+
caption = self.captions[index]
|
88 |
+
else:
|
89 |
+
caption = self.load_caption(self.caption_files[index])
|
90 |
+
ret= {"image": img, "caption": caption, "id": index}
|
91 |
+
else:
|
92 |
+
ret= {"image": img, "id": index}
|
93 |
+
if self.transform is not None:
|
94 |
+
ret = self.transform(ret)
|
95 |
+
return ret
|
96 |
+
|
97 |
+
def subsample(self, n: int = 10):
|
98 |
+
if n is None or n == -1:
|
99 |
+
return self
|
100 |
+
ori_len = len(self)
|
101 |
+
assert n <= ori_len
|
102 |
+
# equal interval subsample
|
103 |
+
ids = self.image_files[::ori_len // n][:n]
|
104 |
+
self.image_files = ids
|
105 |
+
if self.keep_in_mem:
|
106 |
+
self.images = self.images[::ori_len // n][:n]
|
107 |
+
print(f"Dataset subsampled from {ori_len} to {len(self)}")
|
108 |
+
return self
|
109 |
+
|
110 |
+
def with_transform(self, transform):
|
111 |
+
self.transform = transform
|
112 |
+
return self
|
113 |
+
@staticmethod
|
114 |
+
def collate_fn(examples):
|
115 |
+
images = [example["image"] for example in examples]
|
116 |
+
ids = [example["id"] for example in examples]
|
117 |
+
if "caption" in examples[0]:
|
118 |
+
captions = [random.choice(example["caption"]) for example in examples]
|
119 |
+
return {"images": images, "captions": captions, "id": ids}
|
120 |
+
else:
|
121 |
+
return {"images": images, "id": ids}
|
122 |
+
|
123 |
+
|
124 |
+
class ImagePair(ImageSet):
|
125 |
+
def __init__(self, folder1, folder2, transform=None, keep_in_mem=True):
|
126 |
+
self.path1 = folder1
|
127 |
+
self.path2 = folder2
|
128 |
+
self.transform = transform
|
129 |
+
# get all the image files png/jpg
|
130 |
+
self.image_files = [file for file in os.listdir(folder1) if file.endswith(".png") or file.endswith(".jpg")]
|
131 |
+
self.image_files.sort()
|
132 |
+
self.keep_in_mem = keep_in_mem
|
133 |
+
if keep_in_mem:
|
134 |
+
self.images = []
|
135 |
+
for file in self.image_files:
|
136 |
+
img1 = self.load_image(os.path.join(self.path1, file))
|
137 |
+
img2 = self.load_image(os.path.join(self.path2, file))
|
138 |
+
self.images.append((img1, img2))
|
139 |
+
else:
|
140 |
+
self.images = None
|
141 |
+
|
142 |
+
def __getitem__(self, index):
|
143 |
+
if self.keep_in_mem:
|
144 |
+
img1, img2 = self.images[index]
|
145 |
+
else:
|
146 |
+
img1 = self.load_image(os.path.join(self.path1, self.image_files[index]))
|
147 |
+
img2 = self.load_image(os.path.join(self.path2, self.image_files[index]))
|
148 |
+
|
149 |
+
if self.transform is not None:
|
150 |
+
img1 = self.transform(img1)
|
151 |
+
img2 = self.transform(img2)
|
152 |
+
return {"image1": img1, "image2": img2, "id": index}
|
153 |
+
|
154 |
+
|
155 |
+
|
156 |
+
@staticmethod
|
157 |
+
def collate_fn(examples):
|
158 |
+
images1 = [example["image1"] for example in examples]
|
159 |
+
images2 = [example["image2"] for example in examples]
|
160 |
+
# images1 = torch.stack(images1)
|
161 |
+
# images2 = torch.stack(images2)
|
162 |
+
ids = [example["id"] for example in examples]
|
163 |
+
return {"image1": images1, "image2": images2, "id": ids}
|
164 |
+
|
165 |
+
def push_to_huggingface(self, hug_folder):
|
166 |
+
from datasets import Dataset
|
167 |
+
from datasets import Image as HugImage
|
168 |
+
photo_path = [os.path.join(self.path1, file) for file in self.image_files]
|
169 |
+
sketch_path = [os.path.join(self.path2, file) for file in self.image_files]
|
170 |
+
dataset = Dataset.from_dict({"photo": photo_path, "sketch": sketch_path, "file_name": self.image_files})
|
171 |
+
dataset = dataset.cast_column("photo", HugImage())
|
172 |
+
dataset = dataset.cast_column("sketch", HugImage())
|
173 |
+
dataset.push_to_hub(hug_folder, private=True)
|
174 |
+
|
175 |
+
class ImageClass(ImageSet):
|
176 |
+
def __init__(self, folders: list, transform=None, keep_in_mem=True):
|
177 |
+
self.paths = folders
|
178 |
+
self.transform = transform
|
179 |
+
# get all the image files png/jpg
|
180 |
+
self.image_files = []
|
181 |
+
self.keep_in_mem = keep_in_mem
|
182 |
+
for i, folder in enumerate(folders):
|
183 |
+
self.image_files+=[(os.path.join(folder, file), i) for file in os.listdir(folder) if file.endswith(".png") or file.endswith(".jpg")]
|
184 |
+
if keep_in_mem:
|
185 |
+
self.images = []
|
186 |
+
print("Loading images to memory")
|
187 |
+
for file in self.image_files:
|
188 |
+
img = self.load_image(file[0])
|
189 |
+
self.images.append((img, file[1]))
|
190 |
+
print("Loading images to memory done")
|
191 |
+
else:
|
192 |
+
self.images = None
|
193 |
+
|
194 |
+
def __getitem__(self, index):
|
195 |
+
if self.keep_in_mem:
|
196 |
+
img, label = self.images[index]
|
197 |
+
else:
|
198 |
+
img_path, label = self.image_files[index]
|
199 |
+
img = self.load_image(img_path)
|
200 |
+
|
201 |
+
if self.transform is not None:
|
202 |
+
img = self.transform(img)
|
203 |
+
return {"image": img, "label": label, "id": index}
|
204 |
+
|
205 |
+
@staticmethod
|
206 |
+
def collate_fn(examples):
|
207 |
+
images = [example["image"] for example in examples]
|
208 |
+
labels = [example["label"] for example in examples]
|
209 |
+
ids = [example["id"] for example in examples]
|
210 |
+
return {"images": images, "labels":labels, "id": ids}
|
211 |
+
|
212 |
+
|
213 |
+
if __name__ == "__main__":
|
214 |
+
# dataset = ImagePair("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/imgFolder/clip_filtered_remain_50",
|
215 |
+
# "/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/imgFolder/sketch_50",keep_in_mem=False)
|
216 |
+
# dataset.push_to_huggingface("rhfeiyang/photo-sketch-pair-50")
|
217 |
+
|
218 |
+
|
219 |
+
|
220 |
+
dataset = ImagePair("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/imgFolder/clip_filtered_remain_500",
|
221 |
+
"/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/imgFolder/sketch_500",
|
222 |
+
keep_in_mem=True)
|
223 |
+
# dataset.push_to_huggingface("rhfeiyang/photo-sketch-pair-500")
|
224 |
+
# ret = dataset[0]
|
225 |
+
# print(len(dataset))
|
226 |
+
import torch
|
227 |
+
from torchvision import transforms
|
228 |
+
train_transforms = transforms.Compose(
|
229 |
+
[
|
230 |
+
transforms.Resize(256, interpolation=transforms.InterpolationMode.BILINEAR),
|
231 |
+
transforms.CenterCrop(256),
|
232 |
+
transforms.RandomHorizontalFlip(),
|
233 |
+
transforms.ToTensor(),
|
234 |
+
transforms.Normalize([0.5], [0.5]),
|
235 |
+
]
|
236 |
+
)
|
237 |
+
dataset = dataset.with_transform(train_transforms)
|
238 |
+
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4, collate_fn=ImagePair.collate_fn)
|
239 |
+
ret = dataloader.__iter__().__next__()
|
240 |
+
pass
|
custom_datasets/lhq.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Authors: Hui Ren (rhfeiyang.github.io)
|
2 |
+
import os
|
3 |
+
import pickle
|
4 |
+
import random
|
5 |
+
import shutil
|
6 |
+
from torch.utils.data import Dataset
|
7 |
+
from torchvision import transforms
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
class LhqDataset(Dataset):
|
11 |
+
def __init__(self, image_folder_path:str, caption_folder_path:str, id_file:str = "clip_dissection/lhq/idx/subsample_100.pickle", transforms: transforms = None,
|
12 |
+
get_img=True,
|
13 |
+
get_cap=True,):
|
14 |
+
|
15 |
+
if isinstance(id_file, list):
|
16 |
+
self.ids = id_file
|
17 |
+
elif isinstance(id_file, str):
|
18 |
+
with open(id_file, 'rb') as f:
|
19 |
+
print(f"Loading ids from {id_file}", flush=True)
|
20 |
+
self.ids = pickle.load(f)
|
21 |
+
print(f"Loaded ids from {id_file}", flush=True)
|
22 |
+
self.image_folder_path = image_folder_path
|
23 |
+
self.caption_folder_path = caption_folder_path
|
24 |
+
self.transforms = transforms
|
25 |
+
self.column_names = ["image", "text"]
|
26 |
+
self.get_img = get_img
|
27 |
+
self.get_cap = get_cap
|
28 |
+
|
29 |
+
def __len__(self):
|
30 |
+
return len(self.ids)
|
31 |
+
|
32 |
+
def __getitem__(self, index: int):
|
33 |
+
id = self.ids[index]
|
34 |
+
ret={"id":id}
|
35 |
+
if self.get_img:
|
36 |
+
image = self._load_image(id)
|
37 |
+
ret["image"]=image
|
38 |
+
if self.get_cap:
|
39 |
+
target = self._load_caption(id)
|
40 |
+
ret["caption"]=[target]
|
41 |
+
if self.transforms is not None:
|
42 |
+
ret = self.transforms(ret)
|
43 |
+
return ret
|
44 |
+
|
45 |
+
def _load_image(self, id: int):
|
46 |
+
image_path = f"{self.image_folder_path}/{id}.jpg"
|
47 |
+
with open(image_path, 'rb') as f:
|
48 |
+
img = Image.open(f).convert("RGB")
|
49 |
+
return img
|
50 |
+
|
51 |
+
def _load_caption(self, id: int):
|
52 |
+
caption_path = f"{self.caption_folder_path}/{id}.txt"
|
53 |
+
with open(caption_path, 'r') as f:
|
54 |
+
caption_file = f.read()
|
55 |
+
caption = []
|
56 |
+
for line in caption_file.split("\n"):
|
57 |
+
line = line.strip()
|
58 |
+
if len(line) > 0:
|
59 |
+
caption.append(line)
|
60 |
+
return caption
|
61 |
+
|
62 |
+
def subsample(self, n: int = 10000):
|
63 |
+
if n is None or n == -1:
|
64 |
+
return self
|
65 |
+
ori_len = len(self)
|
66 |
+
assert n <= ori_len
|
67 |
+
# equal interval subsample
|
68 |
+
ids = self.ids[::ori_len // n][:n]
|
69 |
+
self.ids = ids
|
70 |
+
print(f"LHQ dataset subsampled from {ori_len} to {len(self)}")
|
71 |
+
return self
|
72 |
+
|
73 |
+
def with_transform(self, transform):
|
74 |
+
self.transforms = transform
|
75 |
+
return self
|
76 |
+
|
77 |
+
|
78 |
+
def generate_idx(data_folder = "/data/vision/torralba/clip_dissection/huiren/lhq/lhq_1024_jpg/lhq_1024_jpg/", save_path = "/data/vision/torralba/clip_dissection/huiren/lhq/idx/all_ids.pickle"):
|
79 |
+
all_ids = os.listdir(data_folder)
|
80 |
+
all_ids = [i.split(".")[0] for i in all_ids if i.endswith(".jpg") or i.endswith(".png")]
|
81 |
+
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
82 |
+
pickle.dump(all_ids, open(f"{save_path}", "wb"))
|
83 |
+
print("all_ids generated")
|
84 |
+
return all_ids
|
85 |
+
|
86 |
+
def random_sample(all_ids, sample_num = 110, save_root = "/data/vision/torralba/clip_dissection/huiren/lhq/subsample"):
|
87 |
+
chosen_id = random.sample(all_ids, sample_num)
|
88 |
+
save_dir = f"{save_root}/{sample_num}"
|
89 |
+
os.makedirs(save_dir, exist_ok=True)
|
90 |
+
for id in chosen_id:
|
91 |
+
img_path = f"/data/vision/torralba/clip_dissection/huiren/lhq/lhq_1024_jpg/lhq_1024_jpg/{id}.jpg"
|
92 |
+
shutil.copy(img_path, save_dir)
|
93 |
+
|
94 |
+
return chosen_id
|
95 |
+
|
96 |
+
if __name__ == "__main__":
|
97 |
+
# all_ids = generate_idx()
|
98 |
+
# with open("/data/vision/torralba/clip_dissection/huiren/lhq/idx/all_ids.pickle", "rb") as f:
|
99 |
+
# all_ids = pickle.load(f)
|
100 |
+
# # random_sample(all_ids, 1)
|
101 |
+
#
|
102 |
+
# # generate_idx(data_folder="/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/clip_dissection/lhq/subsample/100",
|
103 |
+
# # save_path="/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/clip_dissection/lhq/idx/subsample_100.pickle")
|
104 |
+
#
|
105 |
+
# # lhq 500
|
106 |
+
# with open("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/clip_dissection/lhq/idx/subsample_100.pickle", "rb") as f:
|
107 |
+
# lhq_100_idx = pickle.load(f)
|
108 |
+
#
|
109 |
+
# extra_idx = set(all_ids) - set(lhq_100_idx)
|
110 |
+
# add_idx = random.sample(extra_idx, 400)
|
111 |
+
# lhq_500_idx = lhq_100_idx + add_idx
|
112 |
+
# with open("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/clip_dissection/lhq/idx/subsample_500.pickle", "wb") as f:
|
113 |
+
# pickle.dump(lhq_500_idx, f)
|
114 |
+
# save_dir = "/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/clip_dissection/lhq/subsample/500"
|
115 |
+
# os.makedirs(save_dir, exist_ok=True)
|
116 |
+
# for id in lhq_500_idx:
|
117 |
+
# img_path = f"/data/vision/torralba/clip_dissection/huiren/lhq/lhq_1024_jpg/lhq_1024_jpg/{id}.jpg"
|
118 |
+
# # softlink
|
119 |
+
# os.symlink(img_path, os.path.join(save_dir, f"{id}.jpg"))
|
120 |
+
|
121 |
+
# lhq9
|
122 |
+
all_ids = generate_idx(data_folder="/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/clip_dissection/lhq/subsample/9",
|
123 |
+
save_path="/data/vision/torralba/clip_dissection/huiren/lhq/idx/subsample_9.pickle")
|
124 |
+
print(all_ids)
|
125 |
+
|
126 |
+
|
127 |
+
|
custom_datasets/mypath.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
|
4 |
+
class MyPath(object):
|
5 |
+
@staticmethod
|
6 |
+
def db_root_dir(database=''):
|
7 |
+
coco_root = "/data/vision/torralba/datasets/coco_2017"
|
8 |
+
sam_caption_root = "/vision-nfs/torralba/datasets/vision/sam/captions"
|
9 |
+
|
10 |
+
root_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
11 |
+
map={
|
12 |
+
"coco_train": f"{coco_root}/train2017/",
|
13 |
+
"coco_caption_train": f"{coco_root}/annotations/captions_train2017.json",
|
14 |
+
"coco_val": f"{coco_root}/val2017/",
|
15 |
+
"coco_caption_val": f"{coco_root}/annotations/captions_val2017.json",
|
16 |
+
"sam_images": "/vision-nfs/torralba/datasets/vision/sam/images",
|
17 |
+
"sam_captions": sam_caption_root,
|
18 |
+
"sam_whole_filtered_ids_train": "data/filtered_sam/all_remain_ids_train.pickle",
|
19 |
+
"sam_whole_filtered_ids_val": "data/filtered_sam/all_remain_ids_val.pickle",
|
20 |
+
"sam_id_dict": "data/filtered_sam/all_id_dict.pickle",
|
21 |
+
|
22 |
+
"lhq_ids_sub500": "data/LHQ500_caption/idx/subsample_500.pickle",
|
23 |
+
"lhq_images": "data/LHQ500_caption/subsample_500",
|
24 |
+
"lhq_captions": "data/LHQ500_caption/captions",
|
25 |
+
}
|
26 |
+
ret = map.get(database, None)
|
27 |
+
if ret is None:
|
28 |
+
raise NotImplementedError
|
29 |
+
return ret
|
custom_datasets/sam.py
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Authors: Hui Ren (rhfeiyang.github.io)
|
2 |
+
import os.path
|
3 |
+
import sys
|
4 |
+
from typing import Any, Callable, List, Optional, Tuple
|
5 |
+
|
6 |
+
import tqdm
|
7 |
+
from PIL import Image
|
8 |
+
|
9 |
+
from torch.utils.data import Dataset
|
10 |
+
import pickle
|
11 |
+
from torchvision import transforms
|
12 |
+
# import torch
|
13 |
+
# import torchvision
|
14 |
+
# import re
|
15 |
+
|
16 |
+
|
17 |
+
class SamDataset(Dataset):
|
18 |
+
def __init__(self, image_folder_path:str, caption_folder_path:str, id_file:str = "data/sam/clip_filtered_ids.pickle",id_dict_file:str =None , transforms: Optional[Callable] = None,
|
19 |
+
resolution=None,
|
20 |
+
get_img=True,
|
21 |
+
get_cap=True,):
|
22 |
+
if id_dict_file is not None:
|
23 |
+
with open(id_dict_file, 'rb') as f:
|
24 |
+
print(f"Loading id_dict from {id_dict_file}", flush=True)
|
25 |
+
self.id_dict = pickle.load(f)
|
26 |
+
print(f"Loaded id_dict from {id_dict_file}", flush=True)
|
27 |
+
else:
|
28 |
+
self.id_dict = None
|
29 |
+
if isinstance(id_file, list):
|
30 |
+
self.ids = id_file
|
31 |
+
elif isinstance(id_file, str):
|
32 |
+
with open(id_file, 'rb') as f:
|
33 |
+
print(f"Loading ids from {id_file}", flush=True)
|
34 |
+
self.ids = pickle.load(f)
|
35 |
+
print(f"Loaded ids from {id_file}", flush=True)
|
36 |
+
self.resolution = resolution
|
37 |
+
self.ori_image_folder_path = image_folder_path
|
38 |
+
if self.resolution is not None:
|
39 |
+
if os.path.exists("/var/jomat/datasets/"):
|
40 |
+
# self.image_folder_path = f"/var/jomat/datasets/SAM_{resolution}"
|
41 |
+
self.image_folder_path = f"{image_folder_path}_{resolution}"
|
42 |
+
else:
|
43 |
+
self.image_folder_path = f"{image_folder_path}_{resolution}"
|
44 |
+
os.makedirs(self.image_folder_path, exist_ok=True)
|
45 |
+
else:
|
46 |
+
self.image_folder_path = image_folder_path
|
47 |
+
self.caption_folder_path = caption_folder_path
|
48 |
+
self.transforms = transforms
|
49 |
+
self.column_names = ["image", "text"]
|
50 |
+
self.get_img = get_img
|
51 |
+
self.get_cap = get_cap
|
52 |
+
|
53 |
+
def __len__(self):
|
54 |
+
# return 100
|
55 |
+
return len(self.ids)
|
56 |
+
|
57 |
+
def __getitem__(self, index: int):
|
58 |
+
id = self.ids[index]
|
59 |
+
ret={"id":id}
|
60 |
+
try:
|
61 |
+
# if index == 1:
|
62 |
+
# raise Exception("test")
|
63 |
+
if self.get_img:
|
64 |
+
image = self._load_image(id)
|
65 |
+
ret["image"]=image
|
66 |
+
if self.get_cap:
|
67 |
+
target = self._load_caption(id)
|
68 |
+
ret["text"] = [target]
|
69 |
+
if self.transforms is not None:
|
70 |
+
ret = self.transforms(ret)
|
71 |
+
return ret
|
72 |
+
except Exception as e:
|
73 |
+
raise e
|
74 |
+
print(f"Error loading image and caption for id {id}, error: {e}, redirecting to index 0", flush=True)
|
75 |
+
ret = self[0]
|
76 |
+
return ret
|
77 |
+
|
78 |
+
def define_resolution(self, resolution: int):
|
79 |
+
self.resolution = resolution
|
80 |
+
if os.path.exists("/var/jomat/datasets/"):
|
81 |
+
self.image_folder_path = f"/var/jomat/datasets/SAM_{resolution}"
|
82 |
+
# self.image_folder_path = f"{self.ori_image_folder_path}_{resolution}"
|
83 |
+
else:
|
84 |
+
self.image_folder_path = f"{self.ori_image_folder_path}_{resolution}"
|
85 |
+
print(f"SamDataset resolution defined to {resolution}, new image folder path: {self.image_folder_path}")
|
86 |
+
def _load_image(self, id: int) -> Image.Image:
|
87 |
+
if self.id_dict is not None:
|
88 |
+
subfolder = self.id_dict[id]
|
89 |
+
image_path = f"{self.image_folder_path}/{subfolder}/sa_{id}.jpg"
|
90 |
+
else:
|
91 |
+
image_path = f"{self.image_folder_path}/sa_{id}.jpg"
|
92 |
+
|
93 |
+
try:
|
94 |
+
with open(image_path, 'rb') as f:
|
95 |
+
img = Image.open(f).convert("RGB")
|
96 |
+
# return img
|
97 |
+
except:
|
98 |
+
# load original image
|
99 |
+
if self.id_dict is not None:
|
100 |
+
subfolder = self.id_dict[id]
|
101 |
+
ori_image_path = f"{self.ori_image_folder_path}/{subfolder}/sa_{id}.jpg"
|
102 |
+
else:
|
103 |
+
ori_image_path = f"{self.ori_image_folder_path}/sa_{id}.jpg"
|
104 |
+
assert os.path.exists(ori_image_path)
|
105 |
+
with open(ori_image_path, 'rb') as f:
|
106 |
+
img = Image.open(f).convert("RGB")
|
107 |
+
# resize image keep aspect ratio
|
108 |
+
if self.resolution is not None:
|
109 |
+
img = transforms.Resize(self.resolution, interpolation=transforms.InterpolationMode.BICUBIC)(img)
|
110 |
+
# write image
|
111 |
+
os.makedirs(os.path.dirname(image_path), exist_ok=True)
|
112 |
+
img.save(image_path)
|
113 |
+
|
114 |
+
return img
|
115 |
+
|
116 |
+
|
117 |
+
def _load_caption(self, id: int):
|
118 |
+
caption_path = f"{self.caption_folder_path}/sa_{id}.txt"
|
119 |
+
if not os.path.exists(caption_path):
|
120 |
+
return None
|
121 |
+
try:
|
122 |
+
with open(caption_path, 'r', encoding="utf-8") as f:
|
123 |
+
content = f.read()
|
124 |
+
except Exception as e:
|
125 |
+
raise e
|
126 |
+
print(f"Error reading caption file {caption_path}, error: {e}")
|
127 |
+
return None
|
128 |
+
sentences = content.split('.')
|
129 |
+
# remove empty sentences and sentences with "black and white"(too many false prediction)
|
130 |
+
sentences = [sentence.strip() for sentence in sentences if sentence.strip() and "black and white" not in sentence]
|
131 |
+
# join sentence
|
132 |
+
sentences = ". ".join(sentences)
|
133 |
+
if len(sentences) > 0 and sentences[-1] != '.':
|
134 |
+
sentences += '.'
|
135 |
+
|
136 |
+
return sentences
|
137 |
+
|
138 |
+
def with_transform(self, transform):
|
139 |
+
self.transforms = transform
|
140 |
+
return self
|
141 |
+
|
142 |
+
def subsample(self, n: int = 10000):
|
143 |
+
if n is None or n == -1:
|
144 |
+
return self
|
145 |
+
ori_len = len(self)
|
146 |
+
assert n <= ori_len
|
147 |
+
# equal interval subsample
|
148 |
+
ids = self.ids[::ori_len // n][:n]
|
149 |
+
self.ids = ids
|
150 |
+
print(f"SAM dataset subsampled from {ori_len} to {len(self)}")
|
151 |
+
return self
|
152 |
+
|
153 |
+
|
154 |
+
if __name__ == "__main__":
|
155 |
+
# sam_filt(caption_filt=False, clip_filt=False, clip_logit=True)
|
156 |
+
from custom_datasets.sam_caption.mypath import MyPath
|
157 |
+
dataset = SamDataset(image_folder_path=MyPath.db_root_dir("sam_images"), caption_folder_path=MyPath.db_root_dir("sam_captions"), id_file=MyPath.db_root_dir("sam_whole_filtered_ids_train"), id_dict_file=MyPath.db_root_dir("sam_id_dict"))
|
158 |
+
dataset.get_img = False
|
159 |
+
for i in tqdm.tqdm(dataset):
|
160 |
+
a=i['text']
|
data/Art_adapters/albert-gleizes_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1802d12e4d9526eedb89d99f69051849f14774da3c73ebc9b1393c2b13f17022
|
3 |
+
size 2187129
|
data/Art_adapters/andre-derain_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4c39b39f32ff88dfed978ccc651715ade9edfd901d529adbeb5eedb715b8e159
|
3 |
+
size 2187129
|
data/Art_adapters/andy_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd7764b19a2b4513b3c22f1607d72daa63c4ace97ea803e29e2bcf3f13bab2e8
|
3 |
+
size 2187129
|
data/Art_adapters/camille-corot_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:426c2e4a3bfc26f7fdcc3e82989d717fa5fc6e732cd9df9f8bb293ab72cacfa5
|
3 |
+
size 2187129
|
data/Art_adapters/gerhard-richter_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8be8ef590baceb2bdfac8b25976df88fa7baa1a9c718ed16aa4fa8fa247bb421
|
3 |
+
size 2187129
|
data/Art_adapters/henri-matisse_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:212f0f16ae84c0bae96e213a0b0d5f4309209b332d48cbaa1748b5cdcfb3238a
|
3 |
+
size 2187129
|
data/Art_adapters/jackson-pollock_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5cff54e3e7c544577dbc39d7015a89c4786cd012cf944d0b9db334c1a1d7e30b
|
3 |
+
size 2187129
|
data/Art_adapters/joan-miro_subset2/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c26bdb5bfba85b4eb00631eda149912ba557935773842f95c0596999f799a2b4
|
3 |
+
size 2187129
|
data/Art_adapters/kandinsky_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:24b33205841d9b09c0076b4ba295be29d94677e69b7269465897bbf059a40454
|
3 |
+
size 2187129
|
data/Art_adapters/katsushika-hokusai_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b34b75325c3fd0353b55f390027a32a98f771df7d2fb21dbd8bce81a12ba59e9
|
3 |
+
size 2187129
|
data/Art_adapters/klimt_subset3/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7457f14af7c77f98675063582b35317963d46e942459575d38b5996ed190c58f
|
3 |
+
size 2187129
|
data/Art_adapters/m.c.-escher_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c6df86764f4d4ceec0bd6124a74a51c36665c8491511a5488737b9a64300b97b
|
3 |
+
size 2187129
|
data/Art_adapters/monet_subset2/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6a9ba0305edca3286258a06023b97914b850fbc8b4f5a14769537f9a01ef33f1
|
3 |
+
size 2187129
|
data/Art_adapters/picasso_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8ce7899c19b32dacd2dc46090fd3429495a2230c173bcd96149236d27b5151fd
|
3 |
+
size 2187129
|
data/Art_adapters/roy-lichtenstein_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5ac428a5d0fb136b79eec2349fbcbd99dfac2315c0a7f54d7985299b60b6f66f
|
3 |
+
size 2187129
|
data/Art_adapters/van_gogh_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3ca866dd868fb89a1180bb140dfaf1e48701993c8fa173d70c56c60c9af8d8fb
|
3 |
+
size 2187129
|
data/Art_adapters/walter-battiss_subset2/adapter_alpha1.0_rank1_all_up_1000steps.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:41cad39d7b6e1873cfef85be478851820f5dc80cd7ce11afe2bfa3584662e3ac
|
3 |
+
size 2187129
|
data/unsafe.png
ADDED
hf_demo.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Authors: Hui Ren (rhfeiyang.github.io)
|
2 |
+
import os
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
from diffusers import DiffusionPipeline
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
import torch
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
13 |
+
pipe = DiffusionPipeline.from_pretrained("rhfeiyang/art-free-diffusion-v1",).to(device)
|
14 |
+
|
15 |
+
from inference import get_lora_network, inference, get_validation_dataloader
|
16 |
+
lora_map = {
|
17 |
+
"None": "None",
|
18 |
+
"Andre Derain": "andre-derain_subset1",
|
19 |
+
"Vincent van Gogh": "van_gogh_subset1",
|
20 |
+
"Andy Warhol": "andy_subset1",
|
21 |
+
"Walter Battiss": "walter-battiss_subset2",
|
22 |
+
"Camille Corot": "camille-corot_subset1",
|
23 |
+
"Claude Monet": "monet_subset2",
|
24 |
+
"Pablo Picasso": "picasso_subset1",
|
25 |
+
"Jackson Pollock": "jackson-pollock_subset1",
|
26 |
+
"Gerhard Richter": "gerhard-richter_subset1",
|
27 |
+
"M.C. Escher": "m.c.-escher_subset1",
|
28 |
+
"Albert Gleizes": "albert-gleizes_subset1",
|
29 |
+
"Hokusai": "katsushika-hokusai_subset1",
|
30 |
+
"Wassily Kandinsky": "kandinsky_subset1",
|
31 |
+
"Gustav Klimt": "klimt_subset3",
|
32 |
+
"Roy Lichtenstein": "roy-lichtenstein_subset1",
|
33 |
+
"Henri Matisse": "henri-matisse_subset1",
|
34 |
+
"Joan Miro": "joan-miro_subset2",
|
35 |
+
}
|
36 |
+
|
37 |
+
def demo_inference_gen(adapter_choice:str, prompt:str, samples:int=1,seed:int=0, steps=50, guidance_scale=7.5):
|
38 |
+
adapter_path = lora_map[adapter_choice]
|
39 |
+
if adapter_path not in [None, "None"]:
|
40 |
+
adapter_path = f"data/Art_adapters/{adapter_path}/adapter_alpha1.0_rank1_all_up_1000steps.pt"
|
41 |
+
|
42 |
+
prompts = [prompt]*samples
|
43 |
+
infer_loader = get_validation_dataloader(prompts)
|
44 |
+
network = get_lora_network(pipe.unet, adapter_path)["network"]
|
45 |
+
pred_images = inference(network, pipe.tokenizer, pipe.text_encoder, pipe.vae, pipe.unet, pipe.scheduler, infer_loader,
|
46 |
+
height=512, width=512, scales=[1.0],
|
47 |
+
save_dir=None, seed=seed,steps=steps, guidance_scale=guidance_scale,
|
48 |
+
start_noise=-1, show=False, style_prompt="sks art", no_load=True,
|
49 |
+
from_scratch=True)[0][1.0]
|
50 |
+
return pred_images
|
51 |
+
|
52 |
+
def demo_inference_stylization(adapter_path:str, prompts:list, image:list, start_noise=800,seed:int=0):
|
53 |
+
infer_loader = get_validation_dataloader(prompts, image)
|
54 |
+
network = get_lora_network(pipe.unet, adapter_path,"all_up")["network"]
|
55 |
+
pred_images = inference(network, pipe.tokenizer, pipe.text_encoder, pipe.vae, pipe.unet, pipe.scheduler, infer_loader,
|
56 |
+
height=512, width=512, scales=[0.,1.],
|
57 |
+
save_dir=None, seed=seed,steps=20, guidance_scale=7.5,
|
58 |
+
start_noise=start_noise, show=True, style_prompt="sks art", no_load=True,
|
59 |
+
from_scratch=False)
|
60 |
+
return pred_images
|
61 |
+
|
62 |
+
# def infer(prompt, samples, steps, scale, seed):
|
63 |
+
# generator = torch.Generator(device=device).manual_seed(seed)
|
64 |
+
# images_list = pipe( # type: ignore
|
65 |
+
# [prompt] * samples,
|
66 |
+
# num_inference_steps=steps,
|
67 |
+
# guidance_scale=scale,
|
68 |
+
# generator=generator,
|
69 |
+
# )
|
70 |
+
# images = []
|
71 |
+
# safe_image = Image.open(r"data/unsafe.png")
|
72 |
+
# print(images_list)
|
73 |
+
# for i, image in enumerate(images_list["images"]): # type: ignore
|
74 |
+
# if images_list["nsfw_content_detected"][i]: # type: ignore
|
75 |
+
# images.append(safe_image)
|
76 |
+
# else:
|
77 |
+
# images.append(image)
|
78 |
+
# return images
|
79 |
+
|
80 |
+
|
81 |
+
|
82 |
+
|
83 |
+
block = gr.Blocks()
|
84 |
+
# Direct infer
|
85 |
+
with block:
|
86 |
+
with gr.Group():
|
87 |
+
with gr.Row():
|
88 |
+
text = gr.Textbox(
|
89 |
+
label="Enter your prompt",
|
90 |
+
max_lines=2,
|
91 |
+
placeholder="Enter your prompt",
|
92 |
+
container=False,
|
93 |
+
value="Park with cherry blossom trees, picnicker’s and a clear blue pond.",
|
94 |
+
)
|
95 |
+
|
96 |
+
|
97 |
+
|
98 |
+
btn = gr.Button("Run", scale=0)
|
99 |
+
gallery = gr.Gallery(
|
100 |
+
label="Generated images",
|
101 |
+
show_label=False,
|
102 |
+
elem_id="gallery",
|
103 |
+
columns=[2],
|
104 |
+
)
|
105 |
+
|
106 |
+
advanced_button = gr.Button("Advanced options", elem_id="advanced-btn")
|
107 |
+
|
108 |
+
with gr.Row(elem_id="advanced-options"):
|
109 |
+
adapter_choice = gr.Dropdown(
|
110 |
+
label="Choose adapter",
|
111 |
+
choices=["None", "Andre Derain","Vincent van Gogh","Andy Warhol", "Walter Battiss",
|
112 |
+
"Camille Corot", "Claude Monet", "Pablo Picasso",
|
113 |
+
"Jackson Pollock", "Gerhard Richter", "M.C. Escher",
|
114 |
+
"Albert Gleizes", "Hokusai", "Wassily Kandinsky", "Gustav Klimt", "Roy Lichtenstein",
|
115 |
+
"Henri Matisse", "Joan Miro"
|
116 |
+
],
|
117 |
+
value="None"
|
118 |
+
)
|
119 |
+
# print(adapter_choice[0])
|
120 |
+
# lora_path = lora_map[adapter_choice.value]
|
121 |
+
# if lora_path is not None:
|
122 |
+
# lora_path = f"data/Art_adapters/{lora_path}/adapter_alpha1.0_rank1_all_up_1000steps.pt"
|
123 |
+
|
124 |
+
samples = gr.Slider(label="Images", minimum=1, maximum=4, value=1, step=1)
|
125 |
+
steps = gr.Slider(label="Steps", minimum=1, maximum=50, value=20, step=1)
|
126 |
+
scale = gr.Slider(
|
127 |
+
label="Guidance Scale", minimum=0, maximum=50, value=7.5, step=0.1
|
128 |
+
)
|
129 |
+
print(scale)
|
130 |
+
seed = gr.Slider(
|
131 |
+
label="Seed",
|
132 |
+
minimum=0,
|
133 |
+
maximum=2147483647,
|
134 |
+
step=1,
|
135 |
+
randomize=True,
|
136 |
+
)
|
137 |
+
|
138 |
+
gr.on([text.submit, btn.click], demo_inference_gen, inputs=[adapter_choice, text, samples, seed, steps, scale], outputs=gallery)
|
139 |
+
advanced_button.click(
|
140 |
+
None,
|
141 |
+
[],
|
142 |
+
text,
|
143 |
+
)
|
144 |
+
|
145 |
+
|
146 |
+
|
147 |
+
block.launch()
|
hf_demo_test.ipynb
ADDED
@@ -0,0 +1,336 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "initial_id",
|
7 |
+
"metadata": {
|
8 |
+
"ExecuteTime": {
|
9 |
+
"end_time": "2024-12-09T09:44:30.641366Z",
|
10 |
+
"start_time": "2024-12-09T09:44:11.789050Z"
|
11 |
+
}
|
12 |
+
},
|
13 |
+
"outputs": [],
|
14 |
+
"source": [
|
15 |
+
"import os\n",
|
16 |
+
"\n",
|
17 |
+
"import gradio as gr\n",
|
18 |
+
"from diffusers import DiffusionPipeline\n",
|
19 |
+
"import matplotlib.pyplot as plt\n",
|
20 |
+
"import torch\n",
|
21 |
+
"from PIL import Image\n"
|
22 |
+
]
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"cell_type": "code",
|
26 |
+
"execution_count": 2,
|
27 |
+
"id": "ddf33e0d3abacc2c",
|
28 |
+
"metadata": {},
|
29 |
+
"outputs": [],
|
30 |
+
"source": [
|
31 |
+
"import sys\n",
|
32 |
+
"#append current path\n",
|
33 |
+
"sys.path.extend(\"/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/release/hf_demo\")"
|
34 |
+
]
|
35 |
+
},
|
36 |
+
{
|
37 |
+
"cell_type": "code",
|
38 |
+
"execution_count": 3,
|
39 |
+
"id": "643e49fd601daf8f",
|
40 |
+
"metadata": {
|
41 |
+
"ExecuteTime": {
|
42 |
+
"end_time": "2024-12-09T09:44:35.790962Z",
|
43 |
+
"start_time": "2024-12-09T09:44:35.779496Z"
|
44 |
+
}
|
45 |
+
},
|
46 |
+
"outputs": [],
|
47 |
+
"source": [
|
48 |
+
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"1\""
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"cell_type": "code",
|
53 |
+
"execution_count": 4,
|
54 |
+
"id": "e03aae2a4e5676dd",
|
55 |
+
"metadata": {
|
56 |
+
"ExecuteTime": {
|
57 |
+
"end_time": "2024-12-09T09:44:44.157412Z",
|
58 |
+
"start_time": "2024-12-09T09:44:37.138452Z"
|
59 |
+
}
|
60 |
+
},
|
61 |
+
"outputs": [
|
62 |
+
{
|
63 |
+
"name": "stderr",
|
64 |
+
"output_type": "stream",
|
65 |
+
"text": [
|
66 |
+
"/data/vision/torralba/selfmanaged/torralba/scratch/jomat/sam_dataset/miniforge3/envs/diffusion/lib/python3.9/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
|
67 |
+
" warnings.warn(\n"
|
68 |
+
]
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"data": {
|
72 |
+
"application/vnd.jupyter.widget-view+json": {
|
73 |
+
"model_id": "9df8347307674ba8afb0250e23109aa1",
|
74 |
+
"version_major": 2,
|
75 |
+
"version_minor": 0
|
76 |
+
},
|
77 |
+
"text/plain": [
|
78 |
+
"Loading pipeline components...: 0%| | 0/7 [00:00<?, ?it/s]"
|
79 |
+
]
|
80 |
+
},
|
81 |
+
"metadata": {},
|
82 |
+
"output_type": "display_data"
|
83 |
+
}
|
84 |
+
],
|
85 |
+
"source": [
|
86 |
+
"pipe = DiffusionPipeline.from_pretrained(\"rhfeiyang/art-free-diffusion-v1\",).to(\"cuda\")\n",
|
87 |
+
"device = \"cuda\""
|
88 |
+
]
|
89 |
+
},
|
90 |
+
{
|
91 |
+
"cell_type": "code",
|
92 |
+
"execution_count": 5,
|
93 |
+
"id": "83916bc68ff5d914",
|
94 |
+
"metadata": {
|
95 |
+
"ExecuteTime": {
|
96 |
+
"end_time": "2024-12-09T09:44:52.694399Z",
|
97 |
+
"start_time": "2024-12-09T09:44:44.210695Z"
|
98 |
+
}
|
99 |
+
},
|
100 |
+
"outputs": [],
|
101 |
+
"source": [
|
102 |
+
"from inference import get_lora_network, inference, get_validation_dataloader\n",
|
103 |
+
"lora_map = {\n",
|
104 |
+
" \"None\": \"None\",\n",
|
105 |
+
" \"Andre Derain\": \"andre-derain_subset1\",\n",
|
106 |
+
" \"Vincent van Gogh\": \"van_gogh_subset1\",\n",
|
107 |
+
" \"Andy Warhol\": \"andy_subset1\",\n",
|
108 |
+
" \"Walter Battiss\": \"walter-battiss_subset2\",\n",
|
109 |
+
" \"Camille Corot\": \"camille-corot_subset1\",\n",
|
110 |
+
" \"Claude Monet\": \"monet_subset2\",\n",
|
111 |
+
" \"Pablo Picasso\": \"picasso_subset1\",\n",
|
112 |
+
" \"Jackson Pollock\": \"jackson-pollock_subset1\",\n",
|
113 |
+
" \"Gerhard Richter\": \"gerhard-richter_subset1\",\n",
|
114 |
+
" \"M.C. Escher\": \"m.c.-escher_subset1\",\n",
|
115 |
+
" \"Albert Gleizes\": \"albert-gleizes_subset1\",\n",
|
116 |
+
" \"Hokusai\": \"katsushika-hokusai_subset1\",\n",
|
117 |
+
" \"Wassily Kandinsky\": \"kandinsky_subset1\",\n",
|
118 |
+
" \"Gustav Klimt\": \"klimt_subset3\",\n",
|
119 |
+
" \"Roy Lichtenstein\": \"roy-lichtenstein_subset1\",\n",
|
120 |
+
" \"Henri Matisse\": \"henri-matisse_subset1\",\n",
|
121 |
+
" \"Joan Miro\": \"joan-miro_subset2\",\n",
|
122 |
+
"}\n",
|
123 |
+
"\n",
|
124 |
+
"def demo_inference_gen(adapter_choice:str, prompt:str, samples:int=1,seed:int=0, steps=50, guidance_scale=7.5):\n",
|
125 |
+
" adapter_path = lora_map[adapter_choice]\n",
|
126 |
+
" if adapter_path not in [None, \"None\"]:\n",
|
127 |
+
" adapter_path = f\"data/Art_adapters/{adapter_path}/adapter_alpha1.0_rank1_all_up_1000steps.pt\"\n",
|
128 |
+
"\n",
|
129 |
+
" prompts = [prompt]*samples\n",
|
130 |
+
" infer_loader = get_validation_dataloader(prompts)\n",
|
131 |
+
" network = get_lora_network(pipe.unet, adapter_path)[\"network\"]\n",
|
132 |
+
" pred_images = inference(network, pipe.tokenizer, pipe.text_encoder, pipe.vae, pipe.unet, pipe.scheduler, infer_loader,\n",
|
133 |
+
" height=512, width=512, scales=[1.0],\n",
|
134 |
+
" save_dir=None, seed=seed,steps=steps, guidance_scale=guidance_scale,\n",
|
135 |
+
" start_noise=-1, show=False, style_prompt=\"sks art\", no_load=True,\n",
|
136 |
+
" from_scratch=True)[0][1.0]\n",
|
137 |
+
" return pred_images\n",
|
138 |
+
"\n",
|
139 |
+
"def demo_inference_stylization(adapter_path:str, prompts:list, image:list, start_noise=800,seed:int=0):\n",
|
140 |
+
" infer_loader = get_validation_dataloader(prompts, image)\n",
|
141 |
+
" network = get_lora_network(pipe.unet, adapter_path,\"all_up\")[\"network\"]\n",
|
142 |
+
" pred_images = inference(network, pipe.tokenizer, pipe.text_encoder, pipe.vae, pipe.unet, pipe.scheduler, infer_loader,\n",
|
143 |
+
" height=512, width=512, scales=[0.,1.],\n",
|
144 |
+
" save_dir=None, seed=seed,steps=20, guidance_scale=7.5,\n",
|
145 |
+
" start_noise=start_noise, show=True, style_prompt=\"sks art\", no_load=True,\n",
|
146 |
+
" from_scratch=False)\n",
|
147 |
+
" return pred_images\n",
|
148 |
+
"\n",
|
149 |
+
"# def infer(prompt, samples, steps, scale, seed):\n",
|
150 |
+
"# generator = torch.Generator(device=device).manual_seed(seed)\n",
|
151 |
+
"# images_list = pipe( # type: ignore\n",
|
152 |
+
"# [prompt] * samples,\n",
|
153 |
+
"# num_inference_steps=steps,\n",
|
154 |
+
"# guidance_scale=scale,\n",
|
155 |
+
"# generator=generator,\n",
|
156 |
+
"# )\n",
|
157 |
+
"# images = []\n",
|
158 |
+
"# safe_image = Image.open(r\"data/unsafe.png\")\n",
|
159 |
+
"# print(images_list)\n",
|
160 |
+
"# for i, image in enumerate(images_list[\"images\"]): # type: ignore\n",
|
161 |
+
"# if images_list[\"nsfw_content_detected\"][i]: # type: ignore\n",
|
162 |
+
"# images.append(safe_image)\n",
|
163 |
+
"# else:\n",
|
164 |
+
"# images.append(image)\n",
|
165 |
+
"# return images\n"
|
166 |
+
]
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"cell_type": "code",
|
170 |
+
"execution_count": 6,
|
171 |
+
"id": "aa33e9d104023847",
|
172 |
+
"metadata": {
|
173 |
+
"ExecuteTime": {
|
174 |
+
"end_time": "2024-12-09T12:09:39.339583Z",
|
175 |
+
"start_time": "2024-12-09T12:09:38.953936Z"
|
176 |
+
}
|
177 |
+
},
|
178 |
+
"outputs": [
|
179 |
+
{
|
180 |
+
"name": "stdout",
|
181 |
+
"output_type": "stream",
|
182 |
+
"text": [
|
183 |
+
"<gradio.components.slider.Slider object at 0x7fa12d3a5280>\n",
|
184 |
+
"Running on local URL: http://127.0.0.1:7876\n",
|
185 |
+
"Running on public URL: https://be7cce8fec75395c82.gradio.live\n",
|
186 |
+
"\n",
|
187 |
+
"This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
|
188 |
+
]
|
189 |
+
},
|
190 |
+
{
|
191 |
+
"data": {
|
192 |
+
"text/html": [
|
193 |
+
"<div><iframe src=\"https://be7cce8fec75395c82.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
194 |
+
],
|
195 |
+
"text/plain": [
|
196 |
+
"<IPython.core.display.HTML object>"
|
197 |
+
]
|
198 |
+
},
|
199 |
+
"metadata": {},
|
200 |
+
"output_type": "display_data"
|
201 |
+
},
|
202 |
+
{
|
203 |
+
"data": {
|
204 |
+
"text/plain": []
|
205 |
+
},
|
206 |
+
"execution_count": 6,
|
207 |
+
"metadata": {},
|
208 |
+
"output_type": "execute_result"
|
209 |
+
},
|
210 |
+
{
|
211 |
+
"name": "stdout",
|
212 |
+
"output_type": "stream",
|
213 |
+
"text": [
|
214 |
+
"Train method: None\n",
|
215 |
+
"Rank: 1, Alpha: 1\n",
|
216 |
+
"create LoRA for U-Net: 0 modules.\n",
|
217 |
+
"save dir: None\n",
|
218 |
+
"['Park with cherry blossom trees, picnicker’s and a clear blue pond in the style of sks art'], seed=949192390\n"
|
219 |
+
]
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"name": "stderr",
|
223 |
+
"output_type": "stream",
|
224 |
+
"text": [
|
225 |
+
"/data/vision/torralba/selfmanaged/torralba/scratch/jomat/sam_dataset/miniforge3/envs/diffusion/lib/python3.9/site-packages/torch/nn/modules/conv.py:456: UserWarning: Plan failed with a cudnnException: CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR: cudnnFinalize Descriptor Failed cudnn_status: CUDNN_STATUS_NOT_SUPPORTED (Triggered internally at /opt/conda/conda-bld/pytorch_1712608883701/work/aten/src/ATen/native/cudnn/Conv_v8.cpp:919.)\n",
|
226 |
+
" return F.conv2d(input, weight, bias, self.stride,\n",
|
227 |
+
"\n",
|
228 |
+
"00%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:03<00:00, 6.90it/s]"
|
229 |
+
]
|
230 |
+
},
|
231 |
+
{
|
232 |
+
"name": "stdout",
|
233 |
+
"output_type": "stream",
|
234 |
+
"text": [
|
235 |
+
"Time taken for one batch, Art Adapter scale=1.0: 3.2747044563293457\n"
|
236 |
+
]
|
237 |
+
}
|
238 |
+
],
|
239 |
+
"source": [
|
240 |
+
"block = gr.Blocks()\n",
|
241 |
+
"# Direct infer\n",
|
242 |
+
"with block:\n",
|
243 |
+
" with gr.Group():\n",
|
244 |
+
" with gr.Row():\n",
|
245 |
+
" text = gr.Textbox(\n",
|
246 |
+
" label=\"Enter your prompt\",\n",
|
247 |
+
" max_lines=2,\n",
|
248 |
+
" placeholder=\"Enter your prompt\",\n",
|
249 |
+
" container=False,\n",
|
250 |
+
" value=\"Park with cherry blossom trees, picnicker’s and a clear blue pond.\",\n",
|
251 |
+
" )\n",
|
252 |
+
" \n",
|
253 |
+
"\n",
|
254 |
+
" \n",
|
255 |
+
" btn = gr.Button(\"Run\", scale=0)\n",
|
256 |
+
" gallery = gr.Gallery(\n",
|
257 |
+
" label=\"Generated images\",\n",
|
258 |
+
" show_label=False,\n",
|
259 |
+
" elem_id=\"gallery\",\n",
|
260 |
+
" columns=[2],\n",
|
261 |
+
" )\n",
|
262 |
+
"\n",
|
263 |
+
" advanced_button = gr.Button(\"Advanced options\", elem_id=\"advanced-btn\")\n",
|
264 |
+
"\n",
|
265 |
+
" with gr.Row(elem_id=\"advanced-options\"):\n",
|
266 |
+
" adapter_choice = gr.Dropdown(\n",
|
267 |
+
" label=\"Choose adapter\",\n",
|
268 |
+
" choices=[\"None\", \"Andre Derain\",\"Vincent van Gogh\",\"Andy Warhol\", \"Walter Battiss\",\n",
|
269 |
+
" \"Camille Corot\", \"Claude Monet\", \"Pablo Picasso\",\n",
|
270 |
+
" \"Jackson Pollock\", \"Gerhard Richter\", \"M.C. Escher\",\n",
|
271 |
+
" \"Albert Gleizes\", \"Hokusai\", \"Wassily Kandinsky\", \"Gustav Klimt\", \"Roy Lichtenstein\",\n",
|
272 |
+
" \"Henri Matisse\", \"Joan Miro\"\n",
|
273 |
+
" ],\n",
|
274 |
+
" value=\"None\"\n",
|
275 |
+
" )\n",
|
276 |
+
" # print(adapter_choice[0])\n",
|
277 |
+
" # lora_path = lora_map[adapter_choice.value]\n",
|
278 |
+
" # if lora_path is not None:\n",
|
279 |
+
" # lora_path = f\"data/Art_adapters/{lora_path}/adapter_alpha1.0_rank1_all_up_1000steps.pt\"\n",
|
280 |
+
"\n",
|
281 |
+
" samples = gr.Slider(label=\"Images\", minimum=1, maximum=4, value=1, step=1)\n",
|
282 |
+
" steps = gr.Slider(label=\"Steps\", minimum=1, maximum=50, value=20, step=1)\n",
|
283 |
+
" scale = gr.Slider(\n",
|
284 |
+
" label=\"Guidance Scale\", minimum=0, maximum=50, value=7.5, step=0.1\n",
|
285 |
+
" )\n",
|
286 |
+
" print(scale)\n",
|
287 |
+
" seed = gr.Slider(\n",
|
288 |
+
" label=\"Seed\",\n",
|
289 |
+
" minimum=0,\n",
|
290 |
+
" maximum=2147483647,\n",
|
291 |
+
" step=1,\n",
|
292 |
+
" randomize=True,\n",
|
293 |
+
" )\n",
|
294 |
+
"\n",
|
295 |
+
" gr.on([text.submit, btn.click], demo_inference_gen, inputs=[adapter_choice, text, samples, seed, steps, scale], outputs=gallery)\n",
|
296 |
+
" advanced_button.click(\n",
|
297 |
+
" None,\n",
|
298 |
+
" [],\n",
|
299 |
+
" text,\n",
|
300 |
+
" )\n",
|
301 |
+
"\n",
|
302 |
+
"\n",
|
303 |
+
"block.launch(share=True)"
|
304 |
+
]
|
305 |
+
},
|
306 |
+
{
|
307 |
+
"cell_type": "code",
|
308 |
+
"execution_count": null,
|
309 |
+
"id": "3239c12167a5f2cd",
|
310 |
+
"metadata": {},
|
311 |
+
"outputs": [],
|
312 |
+
"source": []
|
313 |
+
}
|
314 |
+
],
|
315 |
+
"metadata": {
|
316 |
+
"kernelspec": {
|
317 |
+
"display_name": "Python 3 (ipykernel)",
|
318 |
+
"language": "python",
|
319 |
+
"name": "python3"
|
320 |
+
},
|
321 |
+
"language_info": {
|
322 |
+
"codemirror_mode": {
|
323 |
+
"name": "ipython",
|
324 |
+
"version": 3
|
325 |
+
},
|
326 |
+
"file_extension": ".py",
|
327 |
+
"mimetype": "text/x-python",
|
328 |
+
"name": "python",
|
329 |
+
"nbconvert_exporter": "python",
|
330 |
+
"pygments_lexer": "ipython3",
|
331 |
+
"version": "3.9.18"
|
332 |
+
}
|
333 |
+
},
|
334 |
+
"nbformat": 4,
|
335 |
+
"nbformat_minor": 5
|
336 |
+
}
|
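For reference, the generation helper defined in the notebook above can also be called directly, without the Gradio UI. The sketch below is illustrative only: it assumes the notebook cells have already been executed (so that pipe, lora_map, and demo_inference_gen exist in the session); the adapter name, prompt, and output filename are example values.

# Illustrative sketch: call the notebook's helper directly.
# Assumes the cells above have been run, so `pipe`, `lora_map`,
# and `demo_inference_gen` are already defined in the session.
images = demo_inference_gen(
    adapter_choice="Claude Monet",   # any key of lora_map, or "None"
    prompt="Park with cherry blossom trees and a clear blue pond.",
    samples=1,
    seed=0,
    steps=20,
    guidance_scale=7.5,
)
images[0].save("monet_sample.png")   # the helper returns a list of PIL images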
inference.py
ADDED
@@ -0,0 +1,657 @@
1 |
+
# Authors: Hui Ren (rhfeiyang.github.io)
|
2 |
+
import torch
|
3 |
+
from PIL import Image
|
4 |
+
import argparse
|
5 |
+
import os, json, random
|
6 |
+
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import glob, re
|
9 |
+
|
10 |
+
from tqdm import tqdm
|
11 |
+
import numpy as np
|
12 |
+
|
13 |
+
import sys
|
14 |
+
import gc
|
15 |
+
from transformers import CLIPTextModel, CLIPTokenizer, BertModel, BertTokenizer
|
16 |
+
|
17 |
+
# import train_util
|
18 |
+
|
19 |
+
from utils.train_util import get_noisy_image, encode_prompts
|
20 |
+
|
21 |
+
from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel, LMSDiscreteScheduler, DDIMScheduler, PNDMScheduler
|
22 |
+
|
23 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
24 |
+
from utils.lora import LoRANetwork, DEFAULT_TARGET_REPLACE, UNET_TARGET_REPLACE_MODULE_CONV
|
25 |
+
import argparse
|
26 |
+
# from diffusers.training_utils import EMAModel
|
27 |
+
import shutil
|
28 |
+
import yaml
|
29 |
+
from easydict import EasyDict
|
30 |
+
from utils.metrics import StyleContentMetric
|
31 |
+
from torchvision import transforms
|
32 |
+
|
33 |
+
from custom_datasets.coco import CustomCocoCaptions
|
34 |
+
from custom_datasets.imagepair import ImageSet
|
35 |
+
from custom_datasets import get_dataset
|
36 |
+
# from stable_diffusion.utils.modules import get_diffusion_modules
|
37 |
+
# from diffusers import StableDiffusionImg2ImgPipeline
|
38 |
+
from diffusers.utils.torch_utils import randn_tensor
|
39 |
+
import pickle
|
40 |
+
import time
|
41 |
+
def flush():
|
42 |
+
torch.cuda.empty_cache()
|
43 |
+
gc.collect()
|
44 |
+
|
45 |
+
def get_train_method(lora_weight):
|
46 |
+
if lora_weight is None:
|
47 |
+
return 'None'
|
48 |
+
if 'full' in lora_weight:
|
49 |
+
train_method = 'full'
|
50 |
+
elif "down_1_up_2_attn" in lora_weight:
|
51 |
+
train_method = 'up_2_attn'
|
52 |
+
print(f"Using up_2_attn for {lora_weight}")
|
53 |
+
elif "down_2_up_1_up_2_attn" in lora_weight:
|
54 |
+
train_method = 'down_2_up_2_attn'
|
55 |
+
elif "down_2_up_2_attn" in lora_weight:
|
56 |
+
train_method = 'down_2_up_2_attn'
|
57 |
+
elif "down_2_attn" in lora_weight:
|
58 |
+
train_method = 'down_2_attn'
|
59 |
+
elif 'noxattn' in lora_weight:
|
60 |
+
train_method = 'noxattn'
|
61 |
+
elif "xattn" in lora_weight:
|
62 |
+
train_method = 'xattn'
|
63 |
+
elif "attn" in lora_weight:
|
64 |
+
train_method = 'attn'
|
65 |
+
elif "all_up" in lora_weight:
|
66 |
+
train_method = 'all_up'
|
67 |
+
else:
|
68 |
+
train_method = 'None'
|
69 |
+
return train_method
|
70 |
+
|
71 |
+
def get_validation_dataloader(infer_prompts:list[str]=None, infer_images :list[str]=None,resolution=512, batch_size=10, num_workers=4, val_set="laion_pop500"):
|
72 |
+
data_transforms = transforms.Compose(
|
73 |
+
[
|
74 |
+
transforms.Resize(resolution),
|
75 |
+
transforms.CenterCrop(resolution),
|
76 |
+
]
|
77 |
+
)
|
78 |
+
def preprocess(example):
|
79 |
+
ret={}
|
80 |
+
ret["image"] = data_transforms(example["image"]) if "image" in example else None
|
81 |
+
if "caption" in example:
|
82 |
+
if isinstance(example["caption"][0], list):
|
83 |
+
ret["caption"] = example["caption"][0][0]
|
84 |
+
else:
|
85 |
+
ret["caption"] = example["caption"][0]
|
86 |
+
if "seed" in example:
|
87 |
+
ret["seed"] = example["seed"]
|
88 |
+
if "id" in example:
|
89 |
+
ret["id"] = example["id"]
|
90 |
+
if "path" in example:
|
91 |
+
ret["path"] = example["path"]
|
92 |
+
return ret
|
93 |
+
|
94 |
+
def collate_fn(examples):
|
95 |
+
out = {}
|
96 |
+
if "image" in examples[0]:
|
97 |
+
pixel_values = [example["image"] for example in examples]
|
98 |
+
out["pixel_values"] = pixel_values
|
99 |
+
# notice: only take the first prompt for each image
|
100 |
+
if "caption" in examples[0]:
|
101 |
+
prompts = [example["caption"] for example in examples]
|
102 |
+
out["prompts"] = prompts
|
103 |
+
if "seed" in examples[0]:
|
104 |
+
seeds = [example["seed"] for example in examples]
|
105 |
+
out["seed"] = seeds
|
106 |
+
if "path" in examples[0]:
|
107 |
+
paths = [example["path"] for example in examples]
|
108 |
+
out["path"] = paths
|
109 |
+
return out
|
110 |
+
if infer_prompts is None:
|
111 |
+
if val_set == "lhq500":
|
112 |
+
dataset = get_dataset("lhq_sub500", get_val=False)["train"]
|
113 |
+
elif val_set == "custom_coco100":
|
114 |
+
dataset = get_dataset("custom_coco100", get_val=False)["train"]
|
115 |
+
elif val_set == "custom_coco500":
|
116 |
+
dataset = get_dataset("custom_coco500", get_val=False)["train"]
|
117 |
+
|
118 |
+
elif os.path.isdir(val_set):
|
119 |
+
image_folder = os.path.join(val_set, "paintings")
|
120 |
+
caption_folder = os.path.join(val_set, "captions")
|
121 |
+
dataset = ImageSet(folder=image_folder, caption=caption_folder, keep_in_mem=True)
|
122 |
+
elif "custom_caption" in val_set:
|
123 |
+
from custom_datasets.custom_caption import Caption_set
|
124 |
+
name = val_set.replace("custom_caption_", "")
|
125 |
+
dataset = Caption_set(set_name = name)
|
126 |
+
elif val_set == "laion_pop500":
|
127 |
+
dataset = get_dataset("laion_pop500", get_val=False)["train"]
|
128 |
+
elif val_set == "laion_pop500_first_sentence":
|
129 |
+
dataset = get_dataset("laion_pop500_first_sentence", get_val=False)["train"]
|
130 |
+
else:
|
131 |
+
raise ValueError("Unknown dataset")
|
132 |
+
dataset.with_transform(preprocess)
|
133 |
+
elif isinstance(infer_prompts, torch.utils.data.Dataset):
|
134 |
+
dataset = infer_prompts
|
135 |
+
try:
|
136 |
+
dataset.with_transform(preprocess)
|
137 |
+
except:
|
138 |
+
pass
|
139 |
+
|
140 |
+
else:
|
141 |
+
class Dataset(torch.utils.data.Dataset):
|
142 |
+
def __init__(self, prompts, images=None):
|
143 |
+
self.prompts = prompts
|
144 |
+
self.images = images
|
145 |
+
self.get_img = False
|
146 |
+
if images is not None:
|
147 |
+
assert len(prompts) == len(images)
|
148 |
+
self.get_img = True
|
149 |
+
if isinstance(images[0], str):
|
150 |
+
self.images = [Image.open(image).convert("RGB") for image in images]
|
151 |
+
else:
|
152 |
+
self.images = [None] * len(prompts)
|
153 |
+
def __len__(self):
|
154 |
+
return len(self.prompts)
|
155 |
+
def __getitem__(self, idx):
|
156 |
+
img = self.images[idx]
|
157 |
+
if self.get_img and img is not None:
|
158 |
+
img = data_transforms(img)
|
159 |
+
return {"caption": self.prompts[idx], "image":img}
|
160 |
+
dataset = Dataset(infer_prompts, infer_images)
|
161 |
+
|
162 |
+
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, drop_last=False,
|
163 |
+
num_workers=num_workers, pin_memory=True)
|
164 |
+
return dataloader
|
165 |
+
|
166 |
+
def get_lora_network(unet , lora_path, train_method="None", rank=1, alpha=1.0, device="cuda", weight_dtype=torch.float32):
|
167 |
+
if train_method in [None, "None"]:
|
168 |
+
train_method = get_train_method(lora_path)
|
169 |
+
print(f"Train method: {train_method}")
|
170 |
+
|
171 |
+
network_type = "c3lier"
|
172 |
+
if train_method == 'xattn':
|
173 |
+
network_type = 'lierla'
|
174 |
+
|
175 |
+
modules = DEFAULT_TARGET_REPLACE
|
176 |
+
if network_type == "c3lier":
|
177 |
+
modules += UNET_TARGET_REPLACE_MODULE_CONV
|
178 |
+
|
179 |
+
alpha = 1
|
180 |
+
if "rank" in lora_path:
|
181 |
+
rank = int(re.search(r'rank(\d+)', lora_path).group(1))
|
182 |
+
if 'alpha1' in lora_path:
|
183 |
+
alpha = 1.0
|
184 |
+
print(f"Rank: {rank}, Alpha: {alpha}")
|
185 |
+
|
186 |
+
network = LoRANetwork(
|
187 |
+
unet,
|
188 |
+
rank=rank,
|
189 |
+
multiplier=1.0,
|
190 |
+
alpha=alpha,
|
191 |
+
train_method=train_method,
|
192 |
+
).to(device, dtype=weight_dtype)
|
193 |
+
if lora_path not in [None, "None"]:
|
194 |
+
lora_state_dict = torch.load(lora_path)
|
195 |
+
miss = network.load_state_dict(lora_state_dict, strict=False)
|
196 |
+
print(f"Missing: {miss}")
|
197 |
+
ret = {"network": network, "train_method": train_method}
|
198 |
+
return ret
|
199 |
+
|
200 |
+
def get_model(pretrained_ckpt_path, unet_ckpt=None,revision=None, variant=None, lora_path=None, weight_dtype=torch.float32,
|
201 |
+
device="cuda"):
|
202 |
+
modules = {}
|
203 |
+
pipe = DiffusionPipeline.from_pretrained(pretrained_ckpt_path, revision=revision, variant=variant)
|
204 |
+
if unet_ckpt is not None:
|
205 |
+
pipe.unet.from_pretrained(unet_ckpt, subfolder="unet_ema", revision=revision, variant=variant)
|
206 |
+
unet = pipe.unet
|
207 |
+
vae = pipe.vae
|
208 |
+
text_encoder = pipe.text_encoder
|
209 |
+
tokenizer = pipe.tokenizer
|
210 |
+
modules["unet"] = unet
|
211 |
+
modules["vae"] = vae
|
212 |
+
modules["text_encoder"] = text_encoder
|
213 |
+
modules["tokenizer"] = tokenizer
|
214 |
+
# tokenizer = modules["tokenizer"]
|
215 |
+
|
216 |
+
unet.enable_xformers_memory_efficient_attention()
|
217 |
+
unet.to(device, dtype=weight_dtype)
|
218 |
+
if weight_dtype != torch.bfloat16:
|
219 |
+
vae.to(device, dtype=torch.float32)
|
220 |
+
else:
|
221 |
+
vae.to(device, dtype=weight_dtype)
|
222 |
+
text_encoder.to(device, dtype=weight_dtype)
|
223 |
+
|
224 |
+
if lora_path is not None:
|
225 |
+
network = get_lora_network(unet, lora_path, device=device, weight_dtype=weight_dtype)
|
226 |
+
modules["network"] = network
|
227 |
+
return modules
|
228 |
+
|
229 |
+
|
230 |
+
|
231 |
+
@torch.no_grad()
|
232 |
+
def inference(network: LoRANetwork, tokenizer: CLIPTokenizer, text_encoder: CLIPTextModel, vae: AutoencoderKL, unet: UNet2DConditionModel, noise_scheduler: LMSDiscreteScheduler,
|
233 |
+
dataloader, height:int, width:int, scales:list = np.linspace(0,2,5),save_dir:str=None, seed:int = None,
|
234 |
+
weight_dtype: torch.dtype = torch.float32, device: torch.device="cuda", batch_size:int=1, steps:int=50, guidance_scale:float=7.5, start_noise:int=800,
|
235 |
+
uncond_prompt:str=None, uncond_embed=None, style_prompt = None, show:bool = False, no_load:bool=False, from_scratch=False):
|
236 |
+
print(f"save dir: {save_dir}")
|
237 |
+
if start_noise < 0:
|
238 |
+
assert from_scratch
|
239 |
+
network = network.eval()
|
240 |
+
unet = unet.eval()
|
241 |
+
vae = vae.eval()
|
242 |
+
do_convert = not from_scratch
|
243 |
+
|
244 |
+
if not do_convert:
|
245 |
+
try:
|
246 |
+
dataloader.dataset.get_img = False
|
247 |
+
except:
|
248 |
+
pass
|
249 |
+
scales = list(scales)
|
250 |
+
else:
|
251 |
+
scales = ["Real Image"] + list(scales)
|
252 |
+
|
253 |
+
if not no_load and os.path.exists(os.path.join(save_dir, "infer_imgs.pickle")):
|
254 |
+
with open(os.path.join(save_dir, "infer_imgs.pickle"), 'rb') as f:
|
255 |
+
pred_images = pickle.load(f)
|
256 |
+
take=True
|
257 |
+
for key in scales:
|
258 |
+
if key not in pred_images:
|
259 |
+
take=False
|
260 |
+
break
|
261 |
+
if take:
|
262 |
+
print(f"Found existing inference results in {save_dir}", flush=True)
|
263 |
+
return pred_images
|
264 |
+
|
265 |
+
max_length = tokenizer.model_max_length
|
266 |
+
|
267 |
+
pred_images = {scale :[] for scale in scales}
|
268 |
+
all_seeds = {scale:[] for scale in scales}
|
269 |
+
|
270 |
+
prompts = []
|
271 |
+
ori_prompts = []
|
272 |
+
if save_dir is not None:
|
273 |
+
img_output_dir = os.path.join(save_dir, "outputs")
|
274 |
+
os.makedirs(img_output_dir, exist_ok=True)
|
275 |
+
|
276 |
+
if uncond_embed is None:
|
277 |
+
if uncond_prompt is None:
|
278 |
+
uncond_input_text = [""]
|
279 |
+
else:
|
280 |
+
uncond_input_text = [uncond_prompt]
|
281 |
+
uncond_embed = encode_prompts(tokenizer = tokenizer, text_encoder = text_encoder, prompts = uncond_input_text)
|
282 |
+
|
283 |
+
|
284 |
+
for batch in dataloader:
|
285 |
+
ori_prompt = batch["prompts"]
|
286 |
+
image = batch["pixel_values"] if do_convert else None
|
287 |
+
if do_convert:
|
288 |
+
pred_images["Real Image"] += image
|
289 |
+
if isinstance(ori_prompt, list):
|
290 |
+
if isinstance(text_encoder, CLIPTextModel):
|
291 |
+
# trunc prompts for clip encoder
|
292 |
+
ori_prompt = [p.split(".")[0]+"." for p in ori_prompt]
|
293 |
+
prompt = [f"{p.strip()[::-1].replace('.', '',1)[::-1]} in the style of {style_prompt}" for p in ori_prompt] if style_prompt is not None else ori_prompt
|
294 |
+
else:
|
295 |
+
if isinstance(text_encoder, CLIPTextModel):
|
296 |
+
ori_prompt = ori_prompt.split(".")[0]+"."
|
297 |
+
prompt = f"{prompt.strip()[::-1].replace('.', '',1)[::-1]} in the style of {style_prompt}" if style_prompt is not None else ori_prompt
|
298 |
+
|
299 |
+
bcz = len(prompt)
|
300 |
+
single_seed = seed
|
301 |
+
if dataloader.batch_size == 1 and seed is None:
|
302 |
+
if "seed" in batch:
|
303 |
+
single_seed = batch["seed"][0]
|
304 |
+
|
305 |
+
print(f"{prompt}, seed={single_seed}")
|
306 |
+
|
307 |
+
# text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt").to(device)
|
308 |
+
# original_embeddings = text_encoder(**text_input)[0]
|
309 |
+
|
310 |
+
prompts += prompt
|
311 |
+
ori_prompts += ori_prompt
|
312 |
+
# style_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt").to(device)
|
313 |
+
# # style_embeddings = text_encoder(**style_input)[0]
|
314 |
+
# style_embeddings = text_encoder(style_input.input_ids, return_dict=False)[0]
|
315 |
+
|
316 |
+
style_embeddings = encode_prompts(tokenizer = tokenizer, text_encoder = text_encoder, prompts = prompt)
|
317 |
+
original_embeddings = encode_prompts(tokenizer = tokenizer, text_encoder = text_encoder, prompts = ori_prompt)
|
318 |
+
if uncond_embed.shape[0] == 1 and bcz > 1:
|
319 |
+
uncond_embeddings = uncond_embed.repeat(bcz, 1, 1)
|
320 |
+
else:
|
321 |
+
uncond_embeddings = uncond_embed
|
322 |
+
style_text_embeddings = torch.cat([uncond_embeddings, style_embeddings])
|
323 |
+
original_embeddings = torch.cat([uncond_embeddings, original_embeddings])
|
324 |
+
|
325 |
+
generator = torch.manual_seed(single_seed) if single_seed is not None else None
|
326 |
+
noise_scheduler.set_timesteps(steps)
|
327 |
+
if do_convert:
|
328 |
+
noised_latent, _, _ = get_noisy_image(image, vae, generator, unet, noise_scheduler, total_timesteps=int((1000-start_noise)/1000 *steps))
|
329 |
+
else:
|
330 |
+
latent_shape = (bcz, 4, height//8, width//8)
|
331 |
+
noised_latent = randn_tensor(latent_shape, generator=generator, device=vae.device)
|
332 |
+
noised_latent = noised_latent.to(unet.dtype)
|
333 |
+
noised_latent = noised_latent * noise_scheduler.init_noise_sigma
|
334 |
+
for scale in scales:
|
335 |
+
start_time = time.time()
|
336 |
+
if not isinstance(scale, float) and not isinstance(scale, int):
|
337 |
+
continue
|
338 |
+
|
339 |
+
latents = noised_latent.clone().to(weight_dtype).to(device)
|
340 |
+
noise_scheduler.set_timesteps(steps)
|
341 |
+
for t in tqdm(noise_scheduler.timesteps):
|
342 |
+
if do_convert and t>start_noise:
|
343 |
+
continue
|
344 |
+
else:
|
345 |
+
if t > start_noise and start_noise >= 0:
|
346 |
+
current_scale = 0
|
347 |
+
else:
|
348 |
+
current_scale = scale
|
349 |
+
network.set_lora_slider(scale=current_scale)
|
350 |
+
text_embedding = style_text_embeddings
|
351 |
+
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
|
352 |
+
latent_model_input = torch.cat([latents] * 2)
|
353 |
+
|
354 |
+
latent_model_input = noise_scheduler.scale_model_input(latent_model_input, timestep=t)
|
355 |
+
# predict the noise residual
|
356 |
+
with network:
|
357 |
+
noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embedding).sample
|
358 |
+
|
359 |
+
# perform guidance
|
360 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
361 |
+
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
|
362 |
+
|
363 |
+
# compute the previous noisy sample x_t -> x_t-1
|
364 |
+
if isinstance(noise_scheduler, DDPMScheduler):
|
365 |
+
latents = noise_scheduler.step(noise_pred, t, latents, generator=torch.manual_seed(single_seed+t) if single_seed is not None else None).prev_sample
|
366 |
+
else:
|
367 |
+
latents = noise_scheduler.step(noise_pred, t, latents).prev_sample
|
368 |
+
|
369 |
+
# scale and decode the image latents with vae
|
370 |
+
latents = 1 / 0.18215 * latents.to(vae.dtype)
|
371 |
+
|
372 |
+
|
373 |
+
with torch.no_grad():
|
374 |
+
image = vae.decode(latents).sample
|
375 |
+
image = (image / 2 + 0.5).clamp(0, 1)
|
376 |
+
image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
|
377 |
+
images = (image * 255).round().astype("uint8")
|
378 |
+
|
379 |
+
|
380 |
+
pil_images = [Image.fromarray(image) for image in images]
|
381 |
+
pred_images[scale]+=pil_images
|
382 |
+
all_seeds[scale] += [single_seed] * bcz
|
383 |
+
|
384 |
+
end_time = time.time()
|
385 |
+
print(f"Time taken for one batch, Art Adapter scale={scale}: {end_time-start_time}", flush=True)
|
386 |
+
|
387 |
+
if save_dir is not None or show:
|
388 |
+
end_idx = len(list(pred_images.values())[0])
|
389 |
+
for i in range(end_idx-bcz, end_idx):
|
390 |
+
keys = list(pred_images.keys())
|
391 |
+
images_list = [pred_images[key][i] for key in keys]
|
392 |
+
prompt = prompts[i]
|
393 |
+
if len(scales)==1:
|
394 |
+
plt.imshow(images_list[0])
|
395 |
+
plt.axis('off')
|
396 |
+
plt.title(f"{prompt}_{single_seed}_start{start_noise}", fontsize=20)
|
397 |
+
else:
|
398 |
+
fig, ax = plt.subplots(1, len(images_list), figsize=(len(scales)*5,6), layout="constrained")
|
399 |
+
for id, a in enumerate(ax):
|
400 |
+
a.imshow(images_list[id])
|
401 |
+
if isinstance(scales[id], float) or isinstance(scales[id], int):
|
402 |
+
a.set_title(f"Art Adapter scale={scales[id]}", fontsize=20)
|
403 |
+
else:
|
404 |
+
a.set_title(f"{keys[id]}", fontsize=20)
|
405 |
+
a.axis('off')
|
406 |
+
|
407 |
+
# plt.suptitle(f"{os.path.basename(lora_weight).replace('.pt','')}", fontsize=20)
|
408 |
+
|
409 |
+
# plt.tight_layout()
|
410 |
+
# if do_convert:
|
411 |
+
# plt.suptitle(f"{prompt}\nseed{single_seed}_start{start_noise}_guidance{guidance_scale}", fontsize=20)
|
412 |
+
# else:
|
413 |
+
# plt.suptitle(f"{prompt}\nseed{single_seed}_from_scratch_guidance{guidance_scale}", fontsize=20)
|
414 |
+
|
415 |
+
if save_dir is not None:
|
416 |
+
plt.savefig(f"{img_output_dir}/{prompt.replace(' ', '_')[:100]}_seed{single_seed}_start{start_noise}.png")
|
417 |
+
if show:
|
418 |
+
plt.show()
|
419 |
+
plt.close()
|
420 |
+
|
421 |
+
flush()
|
422 |
+
|
423 |
+
if save_dir is not None:
|
424 |
+
with open(os.path.join(save_dir, "infer_imgs.pickle" ), 'wb') as f:
|
425 |
+
pickle.dump(pred_images, f)
|
426 |
+
with open(os.path.join(save_dir, "all_seeds.pickle"), 'wb') as f:
|
427 |
+
to_save={"all_seeds":all_seeds, "batch_size":batch_size}
|
428 |
+
pickle.dump(to_save, f)
|
429 |
+
for scale, images in pred_images.items():
|
430 |
+
subfolder = os.path.join(save_dir,"images", f"{scale}")
|
431 |
+
os.makedirs(subfolder, exist_ok=True)
|
432 |
+
|
433 |
+
used_prompt = ori_prompts
|
434 |
+
if (isinstance(scale, float) or isinstance(scale, int)): #and scale != 0:
|
435 |
+
used_prompt = prompts
|
436 |
+
for i, image in enumerate(images):
|
437 |
+
if scale == "Real Image":
|
438 |
+
suffix = ""
|
439 |
+
else:
|
440 |
+
suffix = f"_seed{all_seeds[scale][i]}"
|
441 |
+
image.save(os.path.join(subfolder, f"{used_prompt[i].replace(' ', '_')[:100]}{suffix}.jpg"))
|
442 |
+
with open(os.path.join(save_dir, "infer_prompts.txt"), 'w') as f:
|
443 |
+
for prompt in prompts:
|
444 |
+
f.write(f"{prompt}\n")
|
445 |
+
with open(os.path.join(save_dir, "ori_prompts.txt"), 'w') as f:
|
446 |
+
for prompt in ori_prompts:
|
447 |
+
f.write(f"{prompt}\n")
|
448 |
+
print(f"Saved inference results to {save_dir}", flush=True)
|
449 |
+
return pred_images, prompts
|
450 |
+
|
451 |
+
@torch.no_grad()
|
452 |
+
def infer_metric(ref_image_folder,pred_images, prompts, save_dir, start_noise=""):
|
453 |
+
prompts = [prompt.split(" in the style of ")[0] for prompt in prompts]
|
454 |
+
scores = {}
|
455 |
+
original_images = pred_images["Real Image"] if "Real Image" in pred_images else None
|
456 |
+
metric = StyleContentMetric(ref_image_folder)
|
457 |
+
for scale, images in pred_images.items():
|
458 |
+
score = metric(images, original_images, prompts)
|
459 |
+
|
460 |
+
scores[scale] = score
|
461 |
+
print(f"Style transfer score at scale {scale}: {score}")
|
462 |
+
scores["ref_path"] = ref_image_folder
|
463 |
+
save_name = f"scores_start{start_noise}.json"
|
464 |
+
os.makedirs(save_dir, exist_ok=True)
|
465 |
+
with open(os.path.join(save_dir, save_name), 'w') as f:
|
466 |
+
json.dump(scores, f, indent=2)
|
467 |
+
return scores
|
468 |
+
|
469 |
+
def parse_args():
|
470 |
+
parser = argparse.ArgumentParser(description='Inference with LoRA')
|
471 |
+
parser.add_argument('--lora_weights', type=str, default=["None"],
|
472 |
+
nargs='+', help='path to your model file')
|
473 |
+
parser.add_argument('--prompts', type=str, default=[],
|
474 |
+
nargs='+', help='prompts to try')
|
475 |
+
parser.add_argument("--prompt_file", type=str, default=None, help="path to the prompt file")
|
476 |
+
parser.add_argument("--prompt_file_key", type=str, default="prompts", help="key to the prompt file")
|
477 |
+
parser.add_argument('--resolution', type=int, default=512, help='resolution of the image')
|
478 |
+
parser.add_argument('--seed', type=int, default=None, help='seed for the random number generator')
|
479 |
+
parser.add_argument("--start_noise", type=int, default=800, help="start noise")
|
480 |
+
parser.add_argument("--from_scratch", default=False, action="store_true", help="from scratch")
|
481 |
+
parser.add_argument("--ref_image_folder", type=str, default=None, help="folder containing reference images")
|
482 |
+
parser.add_argument("--show", action="store_true", help="show the image")
|
483 |
+
parser.add_argument("--batch_size", type=int, default=1, help="batch size")
|
484 |
+
parser.add_argument("--scales", type=float, default=[0.,1.], nargs='+', help="scales to test")
|
485 |
+
parser.add_argument("--train_method", type=str, default=None, help="train method")
|
486 |
+
|
487 |
+
# parser.add_argument("--vae_path", type=str, default="CompVis/stable-diffusion-v1-4", help="Path to the VAE model.")
|
488 |
+
# parser.add_argument("--text_encoder_path", type=str, default="CompVis/stable-diffusion-v1-4", help="Path to the text encoder model.")
|
489 |
+
parser.add_argument("--pretrained_model_name_or_path", type=str, default="rhfeiyang/art-free-diffusion-v1", help="Path to the pretrained model.")
|
490 |
+
parser.add_argument("--unet_ckpt", default=None, type=str, help="Path to the unet checkpoint")
|
491 |
+
parser.add_argument("--guidance_scale", type=float, default=5.0, help="guidance scale")
|
492 |
+
parser.add_argument("--infer_mode", default="sks_art", help="inference mode") #, choices=["style", "ori", "artist", "sks_art","Peter"]
|
493 |
+
parser.add_argument("--save_dir", type=str, default="inference_output", help="save directory")
|
494 |
+
parser.add_argument("--num_workers", type=int, default=4, help="number of workers")
|
495 |
+
parser.add_argument("--no_load", action="store_true", help="no load the pre-inferred results")
|
496 |
+
parser.add_argument("--infer_prompts", type=str, default=None, nargs="+", help="prompts to infer")
|
497 |
+
parser.add_argument("--infer_images", type=str, default=None, nargs="+", help="images to infer")
|
498 |
+
parser.add_argument("--rank", type=int, default=1, help="rank of the lora")
|
499 |
+
parser.add_argument("--val_set", type=str, default="laion_pop500", help="validation set")
|
500 |
+
parser.add_argument("--folder_name", type=str, default=None, help="folder name")
|
501 |
+
parser.add_argument("--scheduler_type",type=str, choices=["ddpm", "ddim", "pndm","lms"], default="ddpm", help="scheduler type")
|
502 |
+
parser.add_argument("--infer_steps", type=int, default=50, help="inference steps")
|
503 |
+
parser.add_argument("--weight_dtype", type=str, default="fp32", help="weight dtype")
|
504 |
+
parser.add_argument("--custom_coco_cap", action="store_true", help="use custom coco caption")
|
505 |
+
args = parser.parse_args()
|
506 |
+
if args.infer_prompts is not None and len(args.infer_prompts) == 1 and os.path.isfile(args.infer_prompts[0]):
|
507 |
+
if args.infer_prompts[0].endswith(".txt") and args.custom_coco_cap:
|
508 |
+
args.infer_prompts = CustomCocoCaptions(custom_file=args.infer_prompts[0])
|
509 |
+
elif args.infer_prompts[0].endswith(".txt"):
|
510 |
+
with open(args.infer_prompts[0], 'r') as f:
|
511 |
+
args.infer_prompts = f.readlines()
|
512 |
+
args.infer_prompts = [prompt.strip() for prompt in args.infer_prompts]
|
513 |
+
elif args.infer_prompts[0].endswith(".csv"):
|
514 |
+
from custom_datasets.custom_caption import Caption_set
|
515 |
+
caption_set = Caption_set(args.infer_prompts[0])
|
516 |
+
args.infer_prompts = caption_set
|
517 |
+
|
518 |
+
|
519 |
+
if args.infer_mode == "style":
|
520 |
+
with open(os.path.join(args.ref_image_folder, "style_label.txt"), 'r') as f:
|
521 |
+
args.style_label = f.readlines()[0].strip()
|
522 |
+
elif args.infer_mode == "artist":
|
523 |
+
with open(os.path.join(args.ref_image_folder, "style_label.txt"), 'r') as f:
|
524 |
+
args.style_label = f.readlines()[0].strip()
|
525 |
+
args.style_label = args.style_label.split(",")[0].strip()
|
526 |
+
elif args.infer_mode == "ori":
|
527 |
+
args.style_label = None
|
528 |
+
else:
|
529 |
+
args.style_label = args.infer_mode.replace("_", " ")
|
530 |
+
if args.ref_image_folder is not None:
|
531 |
+
args.ref_image_folder = os.path.join(args.ref_image_folder, "paintings")
|
532 |
+
|
533 |
+
if args.start_noise < 0:
|
534 |
+
args.from_scratch = True
|
535 |
+
|
536 |
+
|
537 |
+
print(args.__dict__)
|
538 |
+
return args
|
539 |
+
|
540 |
+
|
541 |
+
def main(args):
|
542 |
+
lora_weights = args.lora_weights
|
543 |
+
|
544 |
+
if len(lora_weights) == 1 and isinstance(lora_weights[0], str) and os.path.isdir(lora_weights[0]):
|
545 |
+
lora_weights = glob.glob(os.path.join(lora_weights[0], "*.pt"))
|
546 |
+
lora_weights=sorted(lora_weights, reverse=True)
|
547 |
+
|
548 |
+
width = args.resolution
|
549 |
+
height = args.resolution
|
550 |
+
steps = args.infer_steps
|
551 |
+
|
552 |
+
revision = None
|
553 |
+
device = 'cuda'
|
554 |
+
rank = args.rank
|
555 |
+
if args.weight_dtype == "fp32":
|
556 |
+
weight_dtype = torch.float32
|
557 |
+
elif args.weight_dtype=="fp16":
|
558 |
+
weight_dtype = torch.float16
|
559 |
+
elif args.weight_dtype=="bf16":
|
560 |
+
weight_dtype = torch.bfloat16
|
561 |
+
|
562 |
+
modules = get_model(args.pretrained_model_name_or_path, unet_ckpt=args.unet_ckpt, revision=revision, variant=None, lora_path=None, weight_dtype=weight_dtype, device=device, )
|
563 |
+
if args.scheduler_type == "pndm":
|
564 |
+
noise_scheduler = PNDMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
|
565 |
+
|
566 |
+
elif args.scheduler_type == "ddpm":
|
567 |
+
noise_scheduler = DDPMScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
|
568 |
+
elif args.scheduler_type == "ddim":
|
569 |
+
noise_scheduler = DDIMScheduler(
|
570 |
+
beta_start=0.00085,
|
571 |
+
beta_end=0.012,
|
572 |
+
beta_schedule="scaled_linear",
|
573 |
+
num_train_timesteps=1000,
|
574 |
+
clip_sample=False,
|
575 |
+
prediction_type="epsilon",
|
576 |
+
)
|
577 |
+
elif args.scheduler_type == "lms":
|
578 |
+
noise_scheduler = LMSDiscreteScheduler(beta_start=0.00085,
|
579 |
+
beta_end=0.012,
|
580 |
+
beta_schedule="scaled_linear",
|
581 |
+
num_train_timesteps=1000)
|
582 |
+
else:
|
583 |
+
raise ValueError("Unknown scheduler type")
|
584 |
+
cache=EasyDict()
|
585 |
+
cache.modules = modules
|
586 |
+
|
587 |
+
unet = modules["unet"]
|
588 |
+
vae = modules["vae"]
|
589 |
+
text_encoder = modules["text_encoder"]
|
590 |
+
tokenizer = modules["tokenizer"]
|
591 |
+
|
592 |
+
unet.requires_grad_(False)
|
593 |
+
|
594 |
+
# Move unet, vae and text_encoder to device and cast to weight_dtype
|
595 |
+
vae.requires_grad_(False)
|
596 |
+
text_encoder.requires_grad_(False)
|
597 |
+
|
598 |
+
## dataloader
|
599 |
+
dataloader = get_validation_dataloader(infer_prompts=args.infer_prompts, infer_images=args.infer_images,
|
600 |
+
resolution=args.resolution,
|
601 |
+
batch_size=args.batch_size, num_workers=args.num_workers,
|
602 |
+
val_set=args.val_set)
|
603 |
+
|
604 |
+
|
605 |
+
for lora_weight in lora_weights:
|
606 |
+
print(f"Testing {lora_weight}")
|
607 |
+
# for different seeds on same prompt
|
608 |
+
seed = args.seed
|
609 |
+
|
610 |
+
network_ret = get_lora_network(unet, lora_weight, train_method=args.train_method, rank=rank, alpha=1.0, device=device, weight_dtype=weight_dtype)
|
611 |
+
network = network_ret["network"]
|
612 |
+
train_method = network_ret["train_method"]
|
613 |
+
if args.save_dir is not None:
|
614 |
+
save_dir = args.save_dir
|
615 |
+
if args.style_label is not None:
|
616 |
+
save_dir = os.path.join(save_dir, f"{args.style_label.replace(' ', '_')}")
|
617 |
+
else:
|
618 |
+
save_dir = os.path.join(save_dir, f"ori/{args.start_noise}")
|
619 |
+
else:
|
620 |
+
if args.folder_name is not None:
|
621 |
+
folder_name = args.folder_name
|
622 |
+
else:
|
623 |
+
folder_name = "validation" if args.infer_prompts is None else "validation_prompts"
|
624 |
+
save_dir = os.path.join(os.path.dirname(lora_weight), f"{folder_name}/{train_method}", os.path.basename(lora_weight).replace('.pt','').split('_')[-1])
|
625 |
+
if args.infer_prompts is None:
|
626 |
+
save_dir = os.path.join(save_dir, f"{args.val_set}")
|
627 |
+
|
628 |
+
infer_config = f"{args.scheduler_type}{args.infer_steps}_{args.weight_dtype}_guidance{args.guidance_scale}"
|
629 |
+
save_dir = os.path.join(save_dir, infer_config)
|
630 |
+
os.makedirs(save_dir, exist_ok=True)
|
631 |
+
if args.from_scratch:
|
632 |
+
save_dir = os.path.join(save_dir, "from_scratch")
|
633 |
+
else:
|
634 |
+
save_dir = os.path.join(save_dir, "transfer")
|
635 |
+
save_dir = os.path.join(save_dir, f"start{args.start_noise}")
|
636 |
+
os.makedirs(save_dir, exist_ok=True)
|
637 |
+
with open(os.path.join(save_dir, "infer_args.yaml"), 'w') as f:
|
638 |
+
yaml.dump(vars(args), f)
|
639 |
+
# save code
|
640 |
+
code_dir = os.path.join(save_dir, "code")
|
641 |
+
os.makedirs(code_dir, exist_ok=True)
|
642 |
+
current_file = os.path.basename(__file__)
|
643 |
+
shutil.copy(__file__, os.path.join(code_dir, current_file))
|
644 |
+
with torch.no_grad():
|
645 |
+
pred_images, prompts = inference(network, tokenizer, text_encoder, vae, unet, noise_scheduler, dataloader, height, width,
|
646 |
+
args.scales, save_dir, seed, weight_dtype, device, args.batch_size, steps, guidance_scale=args.guidance_scale,
|
647 |
+
start_noise=args.start_noise, show=args.show, style_prompt=args.style_label, no_load=args.no_load,
|
648 |
+
from_scratch=args.from_scratch)
|
649 |
+
|
650 |
+
if args.ref_image_folder is not None:
|
651 |
+
flush()
|
652 |
+
print("Calculating metrics")
|
653 |
+
infer_metric(args.ref_image_folder, pred_images, prompts, save_dir, args.start_noise)
|
654 |
+
|
655 |
+
if __name__ == "__main__":
|
656 |
+
args = parse_args()
|
657 |
+
main(args)
|
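A minimal usage sketch for the functions in inference.py follows. It assumes an already-loaded DiffusionPipeline named pipe (as in the notebook above) and that the Monet adapter checkpoint exists at the path used throughout this repo; the prompt and output filename are example values.

# Minimal sketch: generate one "sks art" image with the Monet adapter.
# Assumes `pipe` is a loaded DiffusionPipeline (see the notebook above).
from inference import get_lora_network, get_validation_dataloader, inference

adapter = "data/Art_adapters/monet_subset2/adapter_alpha1.0_rank1_all_up_1000steps.pt"
loader = get_validation_dataloader(["A quiet harbor at sunrise"], batch_size=1)
network = get_lora_network(pipe.unet, adapter, "all_up")["network"]
pred_images, prompts = inference(network, pipe.tokenizer, pipe.text_encoder, pipe.vae,
                                 pipe.unet, pipe.scheduler, loader,
                                 height=512, width=512, scales=[1.0],
                                 save_dir=None, seed=0, steps=20, guidance_scale=7.5,
                                 start_noise=-1, style_prompt="sks art", no_load=True,
                                 from_scratch=True)
pred_images[1.0][0].save("harbor_sks_art.png")   # results are keyed by adapter scale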
utils/__init__.py
ADDED
@@ -0,0 +1 @@
# Authors: Hui Ren (rhfeiyang.github.io)
utils/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (203 Bytes)
utils/__pycache__/lora.cpython-39.pyc
ADDED
Binary file (6.29 kB)
utils/__pycache__/metrics.cpython-39.pyc
ADDED
Binary file (19.3 kB)
utils/__pycache__/train_util.cpython-39.pyc
ADDED
Binary file (10.9 kB)
utils/art_filter.py
ADDED
@@ -0,0 +1,210 @@
1 |
+
# Authors: Hui Ren (rhfeiyang.github.io)
|
2 |
+
|
3 |
+
from transformers import CLIPProcessor, CLIPModel
|
4 |
+
import torch
|
5 |
+
import numpy as np
|
6 |
+
import os
|
7 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
8 |
+
from tqdm import tqdm
|
9 |
+
|
10 |
+
class Caption_filter:
|
11 |
+
def __init__(self, filter_prompts=["painting", "paintings", "art", "artwork", "drawings", "sketch", "sketches", "illustration", "illustrations",
|
12 |
+
"sculpture","sculptures", "installation", "printmaking", "digital art", "conceptual art", "mosaic", "tapestry",
|
13 |
+
"abstract", "realism", "surrealism", "impressionism", "expressionism", "cubism", "minimalism", "baroque", "rococo",
|
14 |
+
"pop art", "art nouveau", "art deco", "futurism", "dadaism",
|
15 |
+
"stamp", "stamps", "advertisement", "advertisements","logo", "logos"
|
16 |
+
],):
|
17 |
+
self.filter_prompts = filter_prompts
|
18 |
+
self.total_count=0
|
19 |
+
self.filter_count=[0]*len(filter_prompts)
|
20 |
+
|
21 |
+
def reset(self):
|
22 |
+
self.total_count=0
|
23 |
+
self.filter_count=[0]*len(self.filter_prompts)
|
24 |
+
def filter(self, captions):
|
25 |
+
filter_result = []
|
26 |
+
for caption in captions:
|
27 |
+
words = caption[0]
|
28 |
+
if words == None:
|
29 |
+
filter_result.append((True, "None"))
|
30 |
+
continue
|
31 |
+
words = words.lower()
|
32 |
+
words = words.split()
|
33 |
+
filt = False
|
34 |
+
reason=None
|
35 |
+
for i, filter_keyword in enumerate(self.filter_prompts):
|
36 |
+
key_len = len(filter_keyword.split())
|
37 |
+
for j in range(len(words)-key_len+1):
|
38 |
+
if " ".join(words[j:j+key_len]) == filter_keyword:
|
39 |
+
self.filter_count[i] += 1
|
40 |
+
filt = True
|
41 |
+
reason = filter_keyword
|
42 |
+
break
|
43 |
+
if filt:
|
44 |
+
break
|
45 |
+
filter_result.append((filt, reason))
|
46 |
+
self.total_count += 1
|
47 |
+
return filter_result
|
48 |
+
|
49 |
+
class Clip_filter:
|
50 |
+
prompt_threshold = {
|
51 |
+
"painting": 17,
|
52 |
+
"art": 17.5,
|
53 |
+
"artwork": 19,
|
54 |
+
"drawing": 15.8,
|
55 |
+
"sketch": 17,
|
56 |
+
"illustration": 15,
|
57 |
+
"sculpture": 19.2,
|
58 |
+
"installation art": 20,
|
59 |
+
"printmaking art": 16.3,
|
60 |
+
"digital art": 15,
|
61 |
+
"conceptual art": 18,
|
62 |
+
"mosaic art": 19,
|
63 |
+
"tapestry": 16,
|
64 |
+
"abstract art":16.5,
|
65 |
+
"realism art": 16,
|
66 |
+
"surrealism art": 15,
|
67 |
+
"impressionism art": 17,
|
68 |
+
"expressionism art": 17,
|
69 |
+
"cubism art": 15,
|
70 |
+
"minimalism art": 16,
|
71 |
+
"baroque art": 17.5,
|
72 |
+
"rococo art": 17,
|
73 |
+
"pop art": 16,
|
74 |
+
"art nouveau": 19,
|
75 |
+
"art deco": 19,
|
76 |
+
"futurism art": 16.5,
|
77 |
+
"dadaism art": 16.5,
|
78 |
+
"stamp": 18,
|
79 |
+
"advertisement": 16.5,
|
80 |
+
"logo": 15.5,
|
81 |
+
}
|
82 |
+
@torch.no_grad()
|
83 |
+
def __init__(self, positive_prompt=["painting", "art", "artwork", "drawing", "sketch", "illustration",
|
84 |
+
"sculpture", "installation art", "printmaking art", "digital art", "conceptual art", "mosaic art", "tapestry",
|
85 |
+
"abstract art", "realism art", "surrealism art", "impressionism art", "expressionism art", "cubism art",
|
86 |
+
"minimalism art", "baroque art", "rococo art",
|
87 |
+
"pop art", "art nouveau", "art deco", "futurism art", "dadaism art",
|
88 |
+
"stamp", "advertisement",
|
89 |
+
"logo"
|
90 |
+
],
|
91 |
+
device="cuda"):
|
92 |
+
self.device = device
|
93 |
+
self.model = (CLIPModel.from_pretrained("openai/clip-vit-large-patch14")).to(device)
|
94 |
+
self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
|
95 |
+
self.positive_prompt = positive_prompt
|
96 |
+
self.text = self.positive_prompt
|
97 |
+
self.tokenizer = self.processor.tokenizer
|
98 |
+
self.image_processor = self.processor.image_processor
|
99 |
+
self.text_encoding = self.tokenizer(self.text, return_tensors="pt", padding=True).to(device)
|
100 |
+
self.text_features = self.model.get_text_features(**self.text_encoding)
|
101 |
+
self.text_features = self.text_features / self.text_features.norm(p=2, dim=-1, keepdim=True)
|
102 |
+
@torch.no_grad()
|
103 |
+
def similarity(self, image):
|
104 |
+
# inputs = self.processor(text=self.text, images=image, return_tensors="pt", padding=True)
|
105 |
+
image_processed = self.image_processor(image, return_tensors="pt", padding=True).to(self.device, non_blocking=True)
|
106 |
+
inputs = {**self.text_encoding, **image_processed}
|
107 |
+
outputs = self.model(**inputs)
|
108 |
+
logits_per_image = outputs.logits_per_image
|
109 |
+
return logits_per_image
|
110 |
+
|
111 |
+
def get_logits(self, image):
|
112 |
+
logits_per_image = self.similarity(image)
|
113 |
+
return logits_per_image.cpu()
|
114 |
+
|
115 |
+
def get_image_features(self, image):
|
116 |
+
image_processed = self.image_processor(image, return_tensors="pt", padding=True).to(self.device, non_blocking=True)
|
117 |
+
image_features = self.model.get_image_features(**image_processed)
|
118 |
+
return image_features
|
119 |
+
|
120 |
+
|
121 |
+
class Art_filter:
|
122 |
+
def __init__(self):
|
123 |
+
self.caption_filter = Caption_filter()
|
124 |
+
self.clip_filter = Clip_filter()
|
125 |
+
def caption_filt(self, dataloader):
|
126 |
+
self.caption_filter.reset()
|
127 |
+
dataloader.dataset.get_img = False
|
128 |
+
dataloader.dataset.get_cap = True
|
129 |
+
remain_ids = []
|
130 |
+
filtered_ids = []
|
131 |
+
for i, batch in tqdm(enumerate(dataloader)):
|
132 |
+
captions = batch["text"]
|
133 |
+
filter_result = self.caption_filter.filter(captions)
|
134 |
+
for j, (filt, reason) in enumerate(filter_result):
|
135 |
+
if filt:
|
136 |
+
filtered_ids.append((batch["ids"][j], reason))
|
137 |
+
if i%10==0:
|
138 |
+
print(f"Filtered caption: {captions[j]}, reason: {reason}")
|
139 |
+
else:
|
140 |
+
remain_ids.append(batch["ids"][j])
|
141 |
+
return {"remain_ids":remain_ids, "filtered_ids":filtered_ids, "total_count":self.caption_filter.total_count, "filter_count":self.caption_filter.filter_count, "filter_prompts":self.caption_filter.filter_prompts}
|
142 |
+
|
143 |
+
def clip_filt(self, clip_logits_ckpt:dict):
|
144 |
+
logits = clip_logits_ckpt["clip_logits"]
|
145 |
+
ids = clip_logits_ckpt["ids"]
|
146 |
+
text = clip_logits_ckpt["text"]
|
147 |
+
filt_mask = torch.zeros(logits.shape[0], dtype=torch.bool)
|
148 |
+
for i, prompt in enumerate(text):
|
149 |
+
threshold = Clip_filter.prompt_threshold[prompt]
|
150 |
+
filt_mask = filt_mask | (logits[:,i] >= threshold)
|
151 |
+
filt_ids = []
|
152 |
+
remain_ids = []
|
153 |
+
for i, id in enumerate(ids):
|
154 |
+
if filt_mask[i]:
|
155 |
+
filt_ids.append(id)
|
156 |
+
else:
|
157 |
+
remain_ids.append(id)
|
158 |
+
return {"remain_ids":remain_ids, "filtered_ids":filt_ids}
|
159 |
+
|
160 |
+
def clip_feature(self, dataloader):
|
161 |
+
dataloader.dataset.get_img = True
|
162 |
+
dataloader.dataset.get_cap = False
|
163 |
+
clip_features = []
|
164 |
+
ids = []
|
165 |
+
for i, batch in enumerate(dataloader):
|
166 |
+
images = batch["images"]
|
167 |
+
features = self.clip_filter.get_image_features(images).cpu()
|
168 |
+
clip_features.append(features)
|
169 |
+
ids.extend(batch["ids"])
|
170 |
+
clip_features = torch.cat(clip_features)
|
171 |
+
return {"clip_features":clip_features, "ids":ids}
|
172 |
+
|
173 |
+
|
174 |
+
def clip_logit(self, dataloader):
|
175 |
+
dataloader.dataset.get_img = True
|
176 |
+
dataloader.dataset.get_cap = False
|
177 |
+
clip_features = []
|
178 |
+
clip_logits = []
|
179 |
+
ids = []
|
180 |
+
for i, batch in enumerate(dataloader):
|
181 |
+
images = batch["images"]
|
182 |
+
# logits = self.clip_filter.get_logits(images)
|
183 |
+
feature = self.clip_filter.get_image_features(images)
|
184 |
+
logits = self.clip_logit_by_feat(feature)["clip_logits"]
|
185 |
+
|
186 |
+
clip_features.append(feature)
|
187 |
+
clip_logits.append(logits)
|
188 |
+
ids.extend(batch["ids"])
|
189 |
+
|
190 |
+
clip_features = torch.cat(clip_features)
|
191 |
+
clip_logits = torch.cat(clip_logits)
|
192 |
+
return {"clip_features":clip_features, "clip_logits":clip_logits, "ids":ids, "text": self.clip_filter.text}
|
193 |
+
|
194 |
+
def clip_logit_by_feat(self, feature):
|
195 |
+
feature = feature.clone().to(self.clip_filter.device)
|
196 |
+
feature = feature / feature.norm(p=2, dim=-1, keepdim=True)
|
197 |
+
logit_scale = self.clip_filter.model.logit_scale.exp()
|
198 |
+
logits = ((feature @ self.clip_filter.text_features.T) * logit_scale).cpu()
|
199 |
+
return {"clip_logits":logits, "text": self.clip_filter.text}
|
200 |
+
|
201 |
+
|
202 |
+
|
203 |
+
if __name__ == "__main__":
|
204 |
+
import pickle
|
205 |
+
with open("/vision-nfs/torralba/scratch/jomat/sam_dataset/filt_result/sa_000000/clip_logits_result.pickle","rb") as f:
|
206 |
+
result=pickle.load(f)
|
207 |
+
feat = result['clip_features']
|
208 |
+
logits =Art_filter().clip_logit_by_feat(feat)
|
209 |
+
print(logits)
|
210 |
+
|
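The CLIP-based filter above scores an image against a list of art-related prompts and flags it when any score reaches the per-prompt threshold. A small sketch of scoring a single image follows; the import path and the image filename are assumptions, not part of the uploaded code.

# Sketch: score one image with Clip_filter and report which prompts would flag it.
# Assumes the repo root is on sys.path and an image file `photo.jpg` exists.
from PIL import Image
from utils.art_filter import Clip_filter

clip_filter = Clip_filter(device="cuda")
image = Image.open("photo.jpg").convert("RGB")
logits = clip_filter.get_logits(image)[0]        # one row of prompt similarities
for prompt, score in zip(clip_filter.text, logits.tolist()):
    threshold = Clip_filter.prompt_threshold.get(prompt, float("inf"))
    if score >= threshold:
        print(f"flagged by '{prompt}': {score:.2f} >= {threshold}")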
utils/config_util.py
ADDED
@@ -0,0 +1,105 @@
from typing import Literal, Optional

import yaml

from pydantic import BaseModel
import torch

from lora import TRAINING_METHODS

PRECISION_TYPES = Literal["fp32", "fp16", "bf16", "float32", "float16", "bfloat16"]
NETWORK_TYPES = Literal["lierla", "c3lier"]


class PretrainedModelConfig(BaseModel):
    name_or_path: str
    ckpt_path: Optional[str] = None
    v2: bool = False
    v_pred: bool = False

    clip_skip: Optional[int] = None


class NetworkConfig(BaseModel):
    type: NETWORK_TYPES = "lierla"
    rank: int = 4
    alpha: float = 1.0

    training_method: TRAINING_METHODS = "full"


class TrainConfig(BaseModel):
    precision: PRECISION_TYPES = "bfloat16"
    noise_scheduler: Literal["ddim", "ddpm", "lms", "euler_a"] = "ddim"

    iterations: int = 500
    lr: float = 1e-4
    optimizer: str = "adamw"
    optimizer_args: str = ""
    lr_scheduler: str = "constant"

    max_denoising_steps: int = 50


class SaveConfig(BaseModel):
    name: str = "untitled"
    path: str = "./output"
    per_steps: int = 200
    precision: PRECISION_TYPES = "float32"


class LoggingConfig(BaseModel):
    use_wandb: bool = False

    verbose: bool = False


class OtherConfig(BaseModel):
    use_xformers: bool = False


class RootConfig(BaseModel):
    # prompts_file: str
    pretrained_model: PretrainedModelConfig

    network: NetworkConfig

    train: Optional[TrainConfig]

    save: Optional[SaveConfig]

    logging: Optional[LoggingConfig]

    other: Optional[OtherConfig]


def parse_precision(precision: str) -> torch.dtype:
    if precision == "fp32" or precision == "float32":
        return torch.float32
    elif precision == "fp16" or precision == "float16":
        return torch.float16
    elif precision == "bf16" or precision == "bfloat16":
        return torch.bfloat16

    raise ValueError(f"Invalid precision type: {precision}")


def load_config_from_yaml(config_path: str) -> RootConfig:
    with open(config_path, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    root = RootConfig(**config)

    if root.train is None:
        root.train = TrainConfig()

    if root.save is None:
        root.save = SaveConfig()

    if root.logging is None:
        root.logging = LoggingConfig()

    if root.other is None:
        root.other = OtherConfig()

    return root
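The pydantic models above describe the expected training YAML. Below is a sketch of a matching config file and how load_config_from_yaml would consume it; all field values are examples, and the bare import assumes the utils directory is on the Python path (config_util itself does `from lora import TRAINING_METHODS`).

# Sketch: write an example config and load it with load_config_from_yaml.
# All values are illustrative; run with the utils/ directory on sys.path.
import yaml
from config_util import load_config_from_yaml, parse_precision

example = {
    "pretrained_model": {"name_or_path": "rhfeiyang/art-free-diffusion-v1"},
    "network": {"type": "c3lier", "rank": 1, "alpha": 1.0, "training_method": "full"},
    "train": {"precision": "bfloat16", "iterations": 1000, "lr": 1e-4},
    "save": {"name": "example_adapter", "path": "./output"},
    "logging": {"use_wandb": False},
    "other": {"use_xformers": True},
}
with open("example_config.yaml", "w") as f:
    yaml.safe_dump(example, f)

cfg = load_config_from_yaml("example_config.yaml")
print(cfg.save.name, parse_precision(cfg.train.precision))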
utils/debug_util.py
ADDED
@@ -0,0 +1,16 @@
# For debugging...

import torch


def check_requires_grad(model: torch.nn.Module):
    for name, module in list(model.named_modules())[:5]:
        if len(list(module.parameters())) > 0:
            print(f"Module: {name}")
            for name, param in list(module.named_parameters())[:2]:
                print(f"  Parameter: {name}, Requires Grad: {param.requires_grad}")


def check_training_mode(model: torch.nn.Module):
    for name, module in list(model.named_modules())[:5]:
        print(f"Module: {name}, Training Mode: {module.training}")
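A quick sketch of using these helpers on a U-Net; `pipe` is assumed to be a loaded DiffusionPipeline as in the notebook above.

# Sketch: print grad/training status for the first few U-Net modules.
from utils.debug_util import check_requires_grad, check_training_mode

check_requires_grad(pipe.unet)    # Requires Grad per parameter (first modules only)
check_training_mode(pipe.unet)    # train/eval mode per module (first modules only)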
utils/lora.py
ADDED
@@ -0,0 +1,282 @@
# ref:
# - https://github.com/cloneofsimo/lora/blob/master/lora_diffusion/lora.py
# - https://github.com/kohya-ss/sd-scripts/blob/main/networks/lora.py

import os
import math
from typing import Optional, List, Type, Set, Literal

import torch
import torch.nn as nn
from diffusers import UNet2DConditionModel
from safetensors.torch import save_file


UNET_TARGET_REPLACE_MODULE_TRANSFORMER = [
    # "Transformer2DModel",  # apparently it's this one?  # attn1, 2
    "Attention"
]
UNET_TARGET_REPLACE_MODULE_CONV = [
    "ResnetBlock2D",
    "Downsample2D",
    "Upsample2D",
    # "DownBlock2D",
    # "UpBlock2D"
]  # locon, 3clier

LORA_PREFIX_UNET = "lora_unet"

DEFAULT_TARGET_REPLACE = UNET_TARGET_REPLACE_MODULE_TRANSFORMER

TRAINING_METHODS = Literal[
    "noxattn",  # train all layers except x-attns and time_embed layers
    "innoxattn",  # train all layers except self attention layers
    "selfattn",  # ESD-u, train only self attention layers
    "xattn",  # ESD-x, train only x attention layers
    "full",  # train all layers
    "xattn-strict",  # q and k values
    "noxattn-hspace",
    "noxattn-hspace-last",
    # "xlayer",
    # "outxattn",
    # "outsattn",
    # "inxattn",
    # "inmidsattn",
    # "selflayer",
]


class LoRAModule(nn.Module):
    """
    replaces forward method of the original Linear, instead of replacing the original Linear module.
    """

    def __init__(
        self,
        lora_name,
        org_module: nn.Module,
        multiplier=1.0,
        lora_dim=4,
        alpha=1,
    ):
        """if alpha == 0 or None, alpha is rank (no scaling)."""
        super().__init__()
        self.lora_name = lora_name
        self.lora_dim = lora_dim

        if "Linear" in org_module.__class__.__name__:
            in_dim = org_module.in_features
            out_dim = org_module.out_features
            self.lora_down = nn.Linear(in_dim, lora_dim, bias=False)
            self.lora_up = nn.Linear(lora_dim, out_dim, bias=False)

        elif "Conv" in org_module.__class__.__name__:  # just in case
            in_dim = org_module.in_channels
            out_dim = org_module.out_channels

            self.lora_dim = min(self.lora_dim, in_dim, out_dim)
            if self.lora_dim != lora_dim:
                print(f"{lora_name} dim (rank) is changed to: {self.lora_dim}")

            kernel_size = org_module.kernel_size
            stride = org_module.stride
            padding = org_module.padding
            self.lora_down = nn.Conv2d(
                in_dim, self.lora_dim, kernel_size, stride, padding, bias=False
            )
            self.lora_up = nn.Conv2d(self.lora_dim, out_dim, (1, 1), (1, 1), bias=False)

        if type(alpha) == torch.Tensor:
            alpha = alpha.detach().numpy()
        alpha = lora_dim if alpha is None or alpha == 0 else alpha
        self.scale = alpha / self.lora_dim
        self.register_buffer("alpha", torch.tensor(alpha))  # can be treated as a constant

        # same as microsoft's
        nn.init.kaiming_uniform_(self.lora_down.weight, a=math.sqrt(5))
        nn.init.zeros_(self.lora_up.weight)

        self.multiplier = multiplier
        self.org_module = org_module  # remove in applying

    def apply_to(self):
        self.org_forward = self.org_module.forward
        self.org_module.forward = self.forward
        del self.org_module

    def forward(self, x):
        return (
            self.org_forward(x)
            + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale
        )


class LoRANetwork(nn.Module):
    def __init__(
        self,
        unet: UNet2DConditionModel,
        rank: int = 4,
        multiplier: float = 1.0,
        alpha: float = 1.0,
        train_method: TRAINING_METHODS = "full",
    ) -> None:
        super().__init__()
        self.lora_scale = 1
        self.multiplier = multiplier
        self.lora_dim = rank
        self.alpha = alpha

        self.module = LoRAModule

        self.unet_loras = self.create_modules(
            LORA_PREFIX_UNET,
            unet,
            DEFAULT_TARGET_REPLACE,
            self.lora_dim,
            self.multiplier,
            train_method=train_method,
        )
        print(f"create LoRA for U-Net: {len(self.unet_loras)} modules.")

        lora_names = set()
        for lora in self.unet_loras:
            assert (
                lora.lora_name not in lora_names
            ), f"duplicated lora name: {lora.lora_name}. {lora_names}"
            lora_names.add(lora.lora_name)

        for lora in self.unet_loras:
            lora.apply_to()
            self.add_module(
                lora.lora_name,
                lora,
            )

        del unet

        torch.cuda.empty_cache()

    def create_modules(
        self,
        prefix: str,
        root_module: nn.Module,
        target_replace_modules: List[str],
        rank: int,
        multiplier: float,
        train_method: TRAINING_METHODS,
    ) -> list:
        loras = []
        names = []
        for name, module in root_module.named_modules():
            if train_method == "noxattn" or train_method == "noxattn-hspace" or train_method == "noxattn-hspace-last":  # train everything except cross attention and time embed
                if "attn2" in name or "time_embed" in name:
                    continue
            elif train_method == "innoxattn":  # train everything except cross attention
                if "attn2" in name:
                    continue
            elif train_method == "selfattn":  # train only self attention
                if "attn1" not in name:
                    continue
            elif train_method == "xattn" or train_method == "xattn-strict":  # train only cross attention
                if "attn2" not in name:
                    continue
            elif train_method == "attn":
                if "attn1" not in name and "attn2" not in name:
                    continue
            elif train_method == "full":
                pass
            # else:
            #     raise NotImplementedError(
            #         f"train_method: {train_method} is not implemented."
            #     )
            ##
            # union condition (b-lora)
            else:
                discard = True
                if "all_up" in train_method:
                    if "up_blocks" in name:
                        discard = False
                if "down_1" in train_method:
                    if not ("down_blocks.1" not in name or "attentions" not in name):
                        discard = False
                if "down_2" in train_method:
                    if not ("down_blocks.2" not in name or "attentions" not in name):
                        discard = False
                if "up_1" in train_method:
                    if not ("up_blocks.1" not in name or "attentions" not in name):
                        discard = False
                if "up_2" in train_method:
                    if not ("up_blocks.2" not in name or "attentions" not in name):
                        discard = False
                if discard:
                    continue

            ##
            if module.__class__.__name__ in target_replace_modules:
                for child_name, child_module in module.named_modules():
                    if child_module.__class__.__name__ in ["Linear", "Conv2d", "LoRACompatibleLinear", "LoRACompatibleConv"]:
                        if train_method == 'xattn-strict':
                            if 'out' in child_name:
                                continue
                        if train_method == 'noxattn-hspace':
                            if 'mid_block' not in name:
                                continue
                        if train_method == 'noxattn-hspace-last':
                            if 'mid_block' not in name or '.1' not in name or 'conv2' not in child_name:
                                continue
                        lora_name = prefix + "." + name + "." + child_name
                        lora_name = lora_name.replace(".", "_")
                        # print(f"{lora_name}")
                        lora = self.module(
                            lora_name, child_module, multiplier, rank, self.alpha
                        )
                        # print(name, child_name)
                        # print(child_module.weight.shape)
                        loras.append(lora)
                        names.append(lora_name)
        # print(f'@@@@@@@@@@@@@@@@@@@@@@@@@@@@ \n {names}')
        return loras

    def prepare_optimizer_params(self):
        all_params = []

        if self.unet_loras:  # effectively the only case
            params = []
            [params.extend(lora.parameters()) for lora in self.unet_loras]
            param_data = {"params": params}
            all_params.append(param_data)

        return all_params

    def save_weights(self, file, dtype=None, metadata: Optional[dict] = None):
        state_dict = self.state_dict()

        if dtype is not None:
            for key in list(state_dict.keys()):
                v = state_dict[key]
                v = v.detach().clone().to("cpu").to(dtype)
                state_dict[key] = v

        # for key in list(state_dict.keys()):
        #     if not key.startswith("lora"):
        #         # exclude everything except lora
        #         del state_dict[key]

        if os.path.splitext(file)[1] == ".safetensors":
            save_file(state_dict, file, metadata)
        else:
            torch.save(state_dict, file)

    def set_lora_slider(self, scale):
        self.lora_scale = scale

    def __enter__(self):
        for lora in self.unet_loras:
            lora.multiplier = 1.0 * self.lora_scale

    def __exit__(self, exc_type, exc_value, tb):
        for lora in self.unet_loras:
            lora.multiplier = 0
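A usage sketch for LoRANetwork (illustrative, not part of the upload). The adapter path comes from the data/Art_adapters folder listed in this commit; the choice of train_method="all_up" and the strict=False load are assumptions inferred from the adapter file name, not confirmed by the source.

unet = UNet2DConditionModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="unet"
)
network = LoRANetwork(unet, rank=1, alpha=1.0, train_method="all_up")  # assumed train_method
state_dict = torch.load(
    "data/Art_adapters/van_gogh_subset1/adapter_alpha1.0_rank1_all_up_1000steps.pt",
    map_location="cpu",
)
network.load_state_dict(state_dict, strict=False)  # assumed checkpoint layout
network.set_lora_slider(scale=1.0)
with network:   # inside the block every LoRAModule multiplier equals lora_scale
    pass        # run the U-Net / sampling loop here
# on __exit__ the multipliers are reset to 0, i.e. the adapter is switched off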
utils/metrics.py
ADDED
@@ -0,0 +1,577 @@
# Authors: Hui Ren (rhfeiyang.github.io)

import os

import numpy as np
from torchvision import transforms
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.autograd import Function
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from collections import OrderedDict
from transformers import BatchFeature
import clip
import copy
import lpips
from transformers import ViTImageProcessor, ViTModel

## CSD_CLIP
def convert_weights_float(model: nn.Module):
    """Convert applicable model parameters to fp32"""

    def _convert_weights_to_fp32(l):
        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            l.weight.data = l.weight.data.float()
            if l.bias is not None:
                l.bias.data = l.bias.data.float()

        if isinstance(l, nn.MultiheadAttention):
            for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
                tensor = getattr(l, attr)
                if tensor is not None:
                    tensor.data = tensor.data.float()

        for name in ["text_projection", "proj"]:
            if hasattr(l, name):
                attr = getattr(l, name)
                if attr is not None:
                    attr.data = attr.data.float()

    model.apply(_convert_weights_to_fp32)

class ReverseLayerF(Function):

    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha

        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha

        return output, None


## taken from https://github.com/moein-shariatnia/OpenAI-CLIP/blob/master/modules.py
class ProjectionHead(nn.Module):
    def __init__(
        self,
        embedding_dim,
        projection_dim,
        dropout=0
    ):
        super().__init__()
        self.projection = nn.Linear(embedding_dim, projection_dim)
        self.gelu = nn.GELU()
        self.fc = nn.Linear(projection_dim, projection_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(projection_dim)

    def forward(self, x):
        projected = self.projection(x)
        x = self.gelu(projected)
        x = self.fc(x)
        x = self.dropout(x)
        x = x + projected
        x = self.layer_norm(x)
        return x

def convert_state_dict(state_dict):
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if k.startswith("module."):
            k = k.replace("module.", "")
        new_state_dict[k] = v
    return new_state_dict

def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.normal_(m.bias, std=1e-6)

class Metric(nn.Module):
    def __init__(self):
        super().__init__()
        self.image_preprocess = None

    def load_image(self, image_path):
        with open(image_path, 'rb') as f:
            image = Image.open(f).convert("RGB")
        return image

    def load_image_path(self, image_path):
        if isinstance(image_path, str):
            # should be a image folder path
            images_file = os.listdir(image_path)
            image_path = [os.path.join(image_path, image) for image in images_file if
                          image.endswith(".jpg") or image.endswith(".png")]
        if isinstance(image_path[0], str):
            images = [self.load_image(image) for image in image_path]
        elif isinstance(image_path[0], np.ndarray):
            images = [Image.fromarray(image) for image in image_path]
        elif isinstance(image_path[0], Image.Image):
            images = image_path
        else:
            raise Exception("Invalid input")
        return images

    def preprocess_image(self, image, **kwargs):
        if (isinstance(image, str) and os.path.isdir(image)) or (isinstance(image, list) and (isinstance(image[0], Image.Image) or isinstance(image[0], np.ndarray) or os.path.isfile(image[0]))):
            input_data = self.load_image_path(image)
            input_data = [self.image_preprocess(image, **kwargs) for image in input_data]
            input_data = torch.stack(input_data)
        elif os.path.isfile(image):
            input_data = self.load_image(image)
            input_data = self.image_preprocess(input_data, **kwargs)
            input_data = input_data.unsqueeze(0)
        elif isinstance(image, torch.Tensor):
            raise Exception("Unsupported input")
        return input_data

class Clip_Basic_Metric(Metric):
    def __init__(self):
        super().__init__()
        self.tensor_preprocess = transforms.Compose([
            transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
            # transforms.rescale
            transforms.Normalize(mean=[-1.0, -1.0, -1.0], std=[2.0, 2.0, 2.0]),
            transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
        ])
        self.image_preprocess = transforms.Compose([
            transforms.Resize(size=224, interpolation=transforms.InterpolationMode.BICUBIC),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
        ])

class Clip_metric(Clip_Basic_Metric):

    @torch.no_grad()
    def __init__(self, target_style_prompt: str=None, clip_model_name="openai/clip-vit-large-patch14", device="cuda",
                 bath_size=8, alpha=0.5):
        super().__init__()
        self.device = device
        self.alpha = alpha
        self.model = (CLIPModel.from_pretrained(clip_model_name)).to(device)
        self.processor = CLIPProcessor.from_pretrained(clip_model_name)
        self.tokenizer = self.processor.tokenizer
        self.image_processor = self.processor.image_processor
        # self.style_class_features = self.get_text_features(self.styles).cpu()
        self.style_class_features = []
        # self.noise_prompt_features = self.get_text_features("Noise")
        self.model.eval()
        self.batch_size = bath_size
        if target_style_prompt is not None:
            self.ref_style_features = self.get_text_features(target_style_prompt)
        else:
            self.ref_style_features = None

        self.ref_image_style_prototype = None

    def get_text_features(self, text):
        prompt_encoding = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(self.device)
        prompt_features = self.model.get_text_features(**prompt_encoding).to(self.device)
        prompt_features = F.normalize(prompt_features, p=2, dim=-1)
        return prompt_features

    def get_image_features(self, images):
        # if isinstance(image, torch.Tensor):
        #     self.tensor_transform(image)
        # else:
        #     image_features = self.image_processor(image, return_tensors="pt", padding=True).to(self.device, non_blocking=True)
        images = self.load_image_path(images)
        if isinstance(images, torch.Tensor):
            images = self.tensor_preprocess(images)
            data = {"pixel_values": images}
            image_features = BatchFeature(data=data, tensor_type="pt")
        else:
            image_features = self.image_processor(images, return_tensors="pt", padding=True).to(self.device,
                                                                                                 non_blocking=True)

        image_features = self.model.get_image_features(**image_features).to(self.device)
        image_features = F.normalize(image_features, p=2, dim=-1)
        return image_features

    def img_text_similarity(self, image_features, text=None):
        if text is not None:
            prompt_feature = self.get_text_features(text)
            if isinstance(text, str):
                prompt_feature = prompt_feature.repeat(len(image_features), 1)
        else:
            prompt_feature = self.ref_style_features

        similarity_each = torch.einsum("nc, nc -> n", image_features, prompt_feature)
        return similarity_each

    def forward(self, output_imgs, prompt=None):
        image_features = self.get_image_features(output_imgs)
        # print(image_features)
        style_score = self.img_text_similarity(image_features.mean(dim=0, keepdim=True))
        if prompt is not None:
            content_score = self.img_text_similarity(image_features, prompt)

            score = self.alpha * style_score + (1 - self.alpha) * content_score
            return {"score": score, "style_score": style_score, "content_score": content_score}
        else:
            return {"style_score": style_score}

    def content_score(self, output_imgs, prompt):
        self.to(self.device)
        image_features = self.get_image_features(output_imgs)
        content_score_details = self.img_text_similarity(image_features, prompt)
        self.to("cpu")
        return {"CLIP_content_score": content_score_details.mean().cpu(), "CLIP_content_score_details": content_score_details.cpu()}


class CSD_CLIP(Clip_Basic_Metric):
    """backbone + projection head"""
    def __init__(self, name='vit_large', content_proj_head='default', ckpt_path="data/weights/CSD-checkpoint.pth", device="cuda",
                 alpha=0.5, **kwargs):
        super(CSD_CLIP, self).__init__()
        self.alpha = alpha
        self.content_proj_head = content_proj_head
        self.device = device
        if name == 'vit_large':
            clipmodel, _ = clip.load("ViT-L/14")
            self.backbone = clipmodel.visual
            self.embedding_dim = 1024
        elif name == 'vit_base':
            clipmodel, _ = clip.load("ViT-B/16")
            self.backbone = clipmodel.visual
            self.embedding_dim = 768
            self.feat_dim = 512
        else:
            raise Exception('This model is not implemented')

        convert_weights_float(self.backbone)
        self.last_layer_style = copy.deepcopy(self.backbone.proj)
        if content_proj_head == 'custom':
            self.last_layer_content = ProjectionHead(self.embedding_dim, self.feat_dim)
            self.last_layer_content.apply(init_weights)

        else:
            self.last_layer_content = copy.deepcopy(self.backbone.proj)

        self.backbone.proj = None
        self.backbone.requires_grad_(False)
        self.last_layer_style.requires_grad_(False)
        self.last_layer_content.requires_grad_(False)
        self.backbone.eval()

        if ckpt_path is not None:
            self.load_ckpt(ckpt_path)
        self.to("cpu")

    def load_ckpt(self, ckpt_path):
        checkpoint = torch.load(ckpt_path, map_location="cpu")
        state_dict = convert_state_dict(checkpoint['model_state_dict'])
        msg = self.load_state_dict(state_dict, strict=False)
        print(f"=> loaded CSD_CLIP checkpoint with msg {msg}")

    @property
    def dtype(self):
        return self.backbone.conv1.weight.dtype

    def get_image_features(self, input_data, get_style=True, get_content=False, feature_alpha=None):
        if isinstance(input_data, torch.Tensor):
            input_data = self.tensor_preprocess(input_data)
        elif (isinstance(input_data, str) and os.path.isdir(input_data)) or (isinstance(input_data, list) and (isinstance(input_data[0], Image.Image) or isinstance(input_data[0], np.ndarray) or os.path.isfile(input_data[0]))):
            input_data = self.load_image_path(input_data)
            input_data = [self.image_preprocess(image) for image in input_data]
            input_data = torch.stack(input_data)
        elif os.path.isfile(input_data):
            input_data = self.load_image(input_data)
            input_data = self.image_preprocess(input_data)
            input_data = input_data.unsqueeze(0)
        input_data = input_data.to(self.device)
        style_output = None

        feature = self.backbone(input_data)
        if get_style:
            style_output = feature @ self.last_layer_style
            # style_output = style_output.mean(dim=0)
            style_output = nn.functional.normalize(style_output, dim=-1, p=2)

        content_output = None
        if get_content:
            if feature_alpha is not None:
                reverse_feature = ReverseLayerF.apply(feature, feature_alpha)
            else:
                reverse_feature = feature
            # if alpha is not None:
            if self.content_proj_head == 'custom':
                content_output = self.last_layer_content(reverse_feature)
            else:
                content_output = reverse_feature @ self.last_layer_content
            content_output = nn.functional.normalize(content_output, dim=-1, p=2)

        return feature, content_output, style_output


    @torch.no_grad()
    def define_ref_image_style_prototype(self, ref_image_path: str):
        self.to(self.device)
        _, _, self.ref_style_feature = self.get_image_features(ref_image_path)
        self.to("cpu")
        # self.ref_style_feature = self.ref_style_feature.mean(dim=0)

    @torch.no_grad()
    def forward(self, styled_data):
        self.to(self.device)
        # get_content_feature = original_data is not None
        _, content_output, style_output = self.get_image_features(styled_data, get_content=False)
        style_similarities = style_output @ self.ref_style_feature.T
        mean_style_similarities = style_similarities.mean(dim=-1)
        mean_style_similarity = mean_style_similarities.mean()

        max_style_similarities_v, max_style_similarities_id = style_similarities.max(dim=-1)
        max_style_similarity = max_style_similarities_v.mean()


        self.to("cpu")
        return {"CSD_similarity_mean": mean_style_similarity, "CSD_similarity_max": max_style_similarity, "CSD_similarity_mean_details": mean_style_similarities,
                "CSD_similarity_max_v_details": max_style_similarities_v, "CSD_similarity_max_id_details": max_style_similarities_id}

    def get_style_loss(self, styled_data):
        _, _, style_output = self.get_image_features(styled_data, get_style=True, get_content=False)
        style_similarity = (style_output @ self.ref_style_feature).mean()
        loss = 1 - style_similarity
        return loss.mean()

class LPIPS_metric(Metric):
    def __init__(self, type="vgg", device="cuda"):
        super(LPIPS_metric, self).__init__()
        self.lpips = lpips.LPIPS(net=type)
        self.device = device
        self.image_preprocess = transforms.Compose([
            transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
            transforms.CenterCrop(256),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])
        self.to("cpu")

    @torch.no_grad()
    def forward(self, img1, img2):
        self.to(self.device)
        differences = []
        for i in range(0, len(img1), 50):
            img1_batch = img1[i:i+50]
            img2_batch = img2[i:i+50]
            img1_batch = self.preprocess_image(img1_batch).to(self.device)
            img2_batch = self.preprocess_image(img2_batch).to(self.device)
            differences.append(self.lpips(img1_batch, img2_batch).squeeze())
        differences = torch.cat(differences)
        difference = differences.mean()
        # similarity = 1 - difference
        self.to("cpu")
        return {"LPIPS_content_difference": difference, "LPIPS_content_difference_details": differences}

class Vit_metric(Metric):
    def __init__(self, device="cuda"):
        super(Vit_metric, self).__init__()
        self.device = device
        self.model = ViTModel.from_pretrained('facebook/dino-vitb8').eval()
        self.image_processor = ViTImageProcessor.from_pretrained('facebook/dino-vitb8')
        self.to("cpu")

    def get_image_features(self, images):
        # if isinstance(image, torch.Tensor):
        #     self.tensor_transform(image)
        # else:
        #     image_features = self.image_processor(image, return_tensors="pt", padding=True).to(self.device, non_blocking=True)
        images = self.load_image_path(images)
        batch_size = 20
        all_image_features = []
        for i in range(0, len(images), batch_size):
            image_batch = images[i:i+batch_size]
            if isinstance(image_batch, torch.Tensor):
                image_batch = self.tensor_preprocess(image_batch)
                data = {"pixel_values": image_batch}
                image_processed = BatchFeature(data=data, tensor_type="pt")
            else:
                image_processed = self.image_processor(image_batch, return_tensors="pt").to(self.device)
            image_features = self.model(**image_processed).last_hidden_state.flatten(start_dim=1)
            image_features = F.normalize(image_features, p=2, dim=-1)
            all_image_features.append(image_features)
        all_image_features = torch.cat(all_image_features)
        return all_image_features

    @torch.no_grad()
    def content_metric(self, img1, img2):
        self.to(self.device)
        if not (isinstance(img1, torch.Tensor) and len(img1.shape) == 2):
            img1 = self.get_image_features(img1)
        if not (isinstance(img2, torch.Tensor) and len(img2.shape) == 2):
            img2 = self.get_image_features(img2)
        similarities = torch.einsum("nc, nc -> n", img1, img2)
        similarity = similarities.mean()
        # self.to("cpu")
        return {"Vit_content_similarity": similarity, "Vit_content_similarity_details": similarities}

    # style
    @torch.no_grad()
    def define_ref_image_style_prototype(self, ref_image_path: str):
        self.to(self.device)
        self.ref_style_feature = self.get_image_features(ref_image_path)
        self.to("cpu")

    @torch.no_grad()
    def style_metric(self, styled_data):
        self.to(self.device)
        if isinstance(styled_data, torch.Tensor) and len(styled_data.shape) == 2:
            style_output = styled_data
        else:
            style_output = self.get_image_features(styled_data)
        style_similarities = style_output @ self.ref_style_feature.T
        mean_style_similarities = style_similarities.mean(dim=-1)
        mean_style_similarity = mean_style_similarities.mean()

        max_style_similarities_v, max_style_similarities_id = style_similarities.max(dim=-1)
        max_style_similarity = max_style_similarities_v.mean()

        # self.to("cpu")
        return {"Vit_style_similarity_mean": mean_style_similarity, "Vit_style_similarity_max": max_style_similarity, "Vit_style_similarity_mean_details": mean_style_similarities,
                "Vit_style_similarity_max_v_details": max_style_similarities_v, "Vit_style_similarity_max_id_details": max_style_similarities_id}

    @torch.no_grad()
    def forward(self, styled_data, original_data=None):
        self.to(self.device)
        styled_features = self.get_image_features(styled_data)
        ret = {}
        if original_data is not None:
            content_metric = self.content_metric(styled_features, original_data)
            ret["Vit_content"] = content_metric
        style_metric = self.style_metric(styled_features)
        ret["Vit_style"] = style_metric
        self.to("cpu")
        return ret



class StyleContentMetric(nn.Module):
    def __init__(self, style_ref_image_folder, device="cuda"):
        super(StyleContentMetric, self).__init__()
        self.device = device
        self.clip_style_metric = CSD_CLIP(device=device)
        self.ref_image_file = os.listdir(style_ref_image_folder)
        self.ref_image_file = [i for i in self.ref_image_file if i.endswith(".jpg") or i.endswith(".png")]
        self.ref_image_file.sort()
        self.ref_image_file = np.array(self.ref_image_file)
        ref_image_file_path = [os.path.join(style_ref_image_folder, i) for i in self.ref_image_file]

        self.clip_style_metric.define_ref_image_style_prototype(ref_image_file_path)
        self.vit_metric = Vit_metric(device=device)
        self.vit_metric.define_ref_image_style_prototype(ref_image_file_path)
        self.lpips_metric = LPIPS_metric(device=device)

        self.clip_content_metric = Clip_metric(alpha=0, target_style_prompt=None)

        self.to("cpu")

    def forward(self, styled_data, original_data=None, content_caption=None):
        ret = {}
        csd_score = self.clip_style_metric(styled_data)
        csd_score["max_query"] = self.ref_image_file[csd_score["CSD_similarity_max_id_details"].cpu()].tolist()
        torch.cuda.empty_cache()
        ret["Style_CSD"] = csd_score
        vit_score = self.vit_metric(styled_data, original_data)
        torch.cuda.empty_cache()
        vit_style = vit_score["Vit_style"]
        vit_style["max_query"] = self.ref_image_file[vit_style["Vit_style_similarity_max_id_details"].cpu()].tolist()
        ret["Style_VIT"] = vit_style

        if original_data is not None:
            vit_content = vit_score["Vit_content"]
            ret["Content_VIT"] = vit_content
            lpips_score = self.lpips_metric(styled_data, original_data)
            torch.cuda.empty_cache()
            ret["Content_LPIPS"] = lpips_score

        if content_caption is not None:
            clip_content = self.clip_content_metric.content_score(styled_data, content_caption)
            ret["Content_CLIP"] = clip_content
            torch.cuda.empty_cache()

        for type_key, type_value in ret.items():
            for key, value in type_value.items():
                if isinstance(value, torch.Tensor):
                    if value.numel() == 1:
                        ret[type_key][key] = round(value.item(), 4)
                    else:
                        ret[type_key][key] = value.tolist()
                        ret[type_key][key] = [round(v, 4) for v in ret[type_key][key]]

        self.to("cpu")
        ret["ref_image_file"] = self.ref_image_file.tolist()
        return ret


if __name__ == "__main__":
    with torch.no_grad():
        metric = StyleContentMetric("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/clip_dissection/Art_styles/camille-pissarro/impressionism/split_5/paintings")
        score = metric("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset3/converted_photo/500",
                       "/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset3/paintings")
        print(score)


        lpips = LPIPS_metric()
        score = lpips("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset3/paintings",
                      "/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset3/converted_photo/500")

        print("lpips", score)


        clip_metric = CSD_CLIP()
        clip_metric.define_ref_image_style_prototype(
            "/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset1/paintings")

        score = clip_metric(
            "/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset3/converted_photo/500")
        print("subset3-subset3_sd14_converted", score)

        score = clip_metric(
            "/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/imgFolder/clip_filtered_remain_500")
        print("subset3-photo", score)



        score = clip_metric(
            "/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset1/paintings")
        print("subset3-subset1", score)

        score = clip_metric(
            "/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/andy-warhol/pop_art/subset1/paintings")
        print("subset3-andy", score)
        # score = clip_metric("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset3/paintings", "A painting")

        # print("subset3",score)
        # score_subset2 = clip_metric("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset2/paintings")
        # print("subset2",score_subset2)
        # score_subset3 = clip_metric("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset3/paintings")
        # print("subset3",score_subset3)
        #
        # score_subset3_converted = clip_metric("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset3/converted_photo/500")
        # print("subset3-subset3_sd14_converted" , score_subset3_converted)
        #
        # score_subset3_coco_converted = clip_metric("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset3/coco_converted_photo/500")
        # print("subset3-subset3_coco_converted" , score_subset3_coco_converted)
        #
        # clip_metric = Clip_metric("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/imgFolder/sketch_500")
        # score = clip_metric("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/imgFolder/clip_filtered_remain_500")
        # print("photo500_1-sketch" ,score)
        #
        # clip_metric = Clip_metric("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/imgFolder/clip_filtered_remain_500")
        # score = clip_metric("/afs/csail.mit.edu/u/h/huiren/code/diffusion/stable_diffusion/imgFolder/clip_filtered_remain_500_new")
        # print("photo500_1-photo500_2" ,score)
        # from custom_datasets.imagepair import ImageSet
        # import matplotlib.pyplot as plt
        # dataset = ImageSet(folder = "/data/vision/torralba/scratch/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset3/paintings",
        #                    caption_path="/data/vision/torralba/scratch/huiren/code/diffusion/stable_diffusion/custom_datasets/wikiart/data/gustav-klimt_Art_Nouveau/subset3/captions",
        #                    keep_in_mem=False)
        # for sample in dataset:
        #     score = clip_metric.content_score(sample["image"], sample["caption"][0])
        #     plt.imshow(sample["image"])
        #     plt.title(f"score: {round(score.item(),2)}\n prompt: {sample['caption'][0]}")
        #     plt.show()
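A usage sketch for StyleContentMetric (illustrative, not part of the upload; the folder paths and caption are placeholders). It compares a folder of stylized outputs against reference paintings and, optionally, the original photos.

metric = StyleContentMetric("path/to/reference_paintings", device="cuda")  # placeholder path
scores = metric(
    "path/to/stylized_outputs",               # styled_data: folder of generated images
    original_data="path/to/original_photos",  # optional: enables ViT / LPIPS content scores
    content_caption="a painting of a dog",    # optional: enables the CLIP content score
)
print(scores["Style_CSD"]["CSD_similarity_mean"], scores["Style_VIT"]["Vit_style_similarity_mean"])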
utils/model_util.py
ADDED
@@ -0,0 +1,291 @@
from typing import Literal, Union, Optional

import torch
from transformers import CLIPTextModel, CLIPTokenizer, CLIPTextModelWithProjection
from diffusers import (
    UNet2DConditionModel,
    SchedulerMixin,
    StableDiffusionPipeline,
    StableDiffusionXLPipeline,
    AutoencoderKL,
)
from diffusers.schedulers import (
    DDIMScheduler,
    DDPMScheduler,
    LMSDiscreteScheduler,
    EulerAncestralDiscreteScheduler,
)


TOKENIZER_V1_MODEL_NAME = "CompVis/stable-diffusion-v1-4"
TOKENIZER_V2_MODEL_NAME = "stabilityai/stable-diffusion-2-1"

AVAILABLE_SCHEDULERS = Literal["ddim", "ddpm", "lms", "euler_a"]

SDXL_TEXT_ENCODER_TYPE = Union[CLIPTextModel, CLIPTextModelWithProjection]

DIFFUSERS_CACHE_DIR = None  # if you want to change the cache dir, change this
from diffusers.training_utils import EMAModel
import os
import sys

# from utils.modules import get_diffusion_modules
def load_diffusers_model(
    pretrained_model_name_or_path: str,
    v2: bool = False,
    clip_skip: Optional[int] = None,
    weight_dtype: torch.dtype = torch.float32,
) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel,]:
    # VAE is not needed

    if v2:
        tokenizer = CLIPTokenizer.from_pretrained(
            TOKENIZER_V2_MODEL_NAME,
            subfolder="tokenizer",
            torch_dtype=weight_dtype,
            cache_dir=DIFFUSERS_CACHE_DIR,
        )
        text_encoder = CLIPTextModel.from_pretrained(
            pretrained_model_name_or_path,
            subfolder="text_encoder",
            # default is clip skip 2
            num_hidden_layers=24 - (clip_skip - 1) if clip_skip is not None else 23,
            torch_dtype=weight_dtype,
            cache_dir=DIFFUSERS_CACHE_DIR,
        )
    else:
        tokenizer = CLIPTokenizer.from_pretrained(
            TOKENIZER_V1_MODEL_NAME,
            subfolder="tokenizer",
            torch_dtype=weight_dtype,
            cache_dir=DIFFUSERS_CACHE_DIR,
        )
        text_encoder = CLIPTextModel.from_pretrained(
            pretrained_model_name_or_path,
            subfolder="text_encoder",
            num_hidden_layers=12 - (clip_skip - 1) if clip_skip is not None else 12,
            torch_dtype=weight_dtype,
            cache_dir=DIFFUSERS_CACHE_DIR,
        )

    unet = UNet2DConditionModel.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="unet",
        torch_dtype=weight_dtype,
        cache_dir=DIFFUSERS_CACHE_DIR,
    )

    vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")

    return tokenizer, text_encoder, unet, vae


def load_checkpoint_model(
    checkpoint_path: str,
    v2: bool = False,
    clip_skip: Optional[int] = None,
    weight_dtype: torch.dtype = torch.float32,
) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel,]:
    pipe = StableDiffusionPipeline.from_ckpt(
        checkpoint_path,
        upcast_attention=True if v2 else False,
        torch_dtype=weight_dtype,
        cache_dir=DIFFUSERS_CACHE_DIR,
    )

    unet = pipe.unet
    tokenizer = pipe.tokenizer
    text_encoder = pipe.text_encoder
    vae = pipe.vae
    if clip_skip is not None:
        if v2:
            text_encoder.config.num_hidden_layers = 24 - (clip_skip - 1)
        else:
            text_encoder.config.num_hidden_layers = 12 - (clip_skip - 1)

    del pipe

    return tokenizer, text_encoder, unet, vae


def load_models(
    pretrained_model_name_or_path: str,
    ckpt_path: str,
    scheduler_name: AVAILABLE_SCHEDULERS,
    v2: bool = False,
    v_pred: bool = False,
    weight_dtype: torch.dtype = torch.float32,
) -> tuple[CLIPTokenizer, CLIPTextModel, UNet2DConditionModel, SchedulerMixin,]:
    if pretrained_model_name_or_path.endswith(
        ".ckpt"
    ) or pretrained_model_name_or_path.endswith(".safetensors"):
        tokenizer, text_encoder, unet, vae = load_checkpoint_model(
            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype
        )
    else:  # diffusers
        tokenizer, text_encoder, unet, vae = load_diffusers_model(
            pretrained_model_name_or_path, v2=v2, weight_dtype=weight_dtype
        )

    # VAE is not needed

    scheduler = create_noise_scheduler(
        scheduler_name,
        prediction_type="v_prediction" if v_pred else "epsilon",
    )
    # trained unet_ema
    if ckpt_path not in [None, "None"]:
        ema_unet = EMAModel.from_pretrained(os.path.join(ckpt_path, "unet_ema"), UNet2DConditionModel)
        ema_unet.copy_to(unet.parameters())
    return tokenizer, text_encoder, unet, scheduler, vae


def load_diffusers_model_xl(
    pretrained_model_name_or_path: str,
    weight_dtype: torch.dtype = torch.float32,
) -> tuple[list[CLIPTokenizer], list[SDXL_TEXT_ENCODER_TYPE], UNet2DConditionModel,]:
    # returns tokenizer, tokenizer_2, text_encoder, text_encoder_2, unet

    tokenizers = [
        CLIPTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            subfolder="tokenizer",
            torch_dtype=weight_dtype,
            cache_dir=DIFFUSERS_CACHE_DIR,
        ),
        CLIPTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            subfolder="tokenizer_2",
            torch_dtype=weight_dtype,
            cache_dir=DIFFUSERS_CACHE_DIR,
            pad_token_id=0,  # same as open clip
        ),
    ]

    text_encoders = [
        CLIPTextModel.from_pretrained(
            pretrained_model_name_or_path,
            subfolder="text_encoder",
            torch_dtype=weight_dtype,
            cache_dir=DIFFUSERS_CACHE_DIR,
        ),
        CLIPTextModelWithProjection.from_pretrained(
            pretrained_model_name_or_path,
            subfolder="text_encoder_2",
            torch_dtype=weight_dtype,
            cache_dir=DIFFUSERS_CACHE_DIR,
        ),
    ]

    unet = UNet2DConditionModel.from_pretrained(
        pretrained_model_name_or_path,
        subfolder="unet",
        torch_dtype=weight_dtype,
        cache_dir=DIFFUSERS_CACHE_DIR,
    )
    vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae")
    return tokenizers, text_encoders, unet, vae


def load_checkpoint_model_xl(
    checkpoint_path: str,
    weight_dtype: torch.dtype = torch.float32,
) -> tuple[list[CLIPTokenizer], list[SDXL_TEXT_ENCODER_TYPE], UNet2DConditionModel,]:
    pipe = StableDiffusionXLPipeline.from_single_file(
        checkpoint_path,
        torch_dtype=weight_dtype,
        cache_dir=DIFFUSERS_CACHE_DIR,
    )

    unet = pipe.unet
    tokenizers = [pipe.tokenizer, pipe.tokenizer_2]
    text_encoders = [pipe.text_encoder, pipe.text_encoder_2]
    if len(text_encoders) == 2:
        text_encoders[1].pad_token_id = 0
    vae = pipe.vae
    del pipe

    return tokenizers, text_encoders, unet, vae


def load_models_xl(
    pretrained_model_name_or_path: str,
    scheduler_name: AVAILABLE_SCHEDULERS,
    weight_dtype: torch.dtype = torch.float32,
) -> tuple[
    list[CLIPTokenizer],
    list[SDXL_TEXT_ENCODER_TYPE],
    UNet2DConditionModel,
    SchedulerMixin,
]:
    if pretrained_model_name_or_path.endswith(
        ".ckpt"
    ) or pretrained_model_name_or_path.endswith(".safetensors"):
        (
            tokenizers,
            text_encoders,
            unet,
            vae
        ) = load_checkpoint_model_xl(pretrained_model_name_or_path, weight_dtype)
    else:  # diffusers
        (
            tokenizers,
            text_encoders,
            unet,
            vae
        ) = load_diffusers_model_xl(pretrained_model_name_or_path, weight_dtype)

    scheduler = create_noise_scheduler(scheduler_name)

    return tokenizers, text_encoders, unet, scheduler, vae


def create_noise_scheduler(
    scheduler_name: AVAILABLE_SCHEDULERS = "ddpm",
    prediction_type: Literal["epsilon", "v_prediction"] = "epsilon",
) -> SchedulerMixin:

    name = scheduler_name.lower().replace(" ", "_")
    if name == "ddim":
        # https://huggingface.co/docs/diffusers/v0.17.1/en/api/schedulers/ddim
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            num_train_timesteps=1000,
            clip_sample=False,
            prediction_type=prediction_type,  # is this right?
        )
    elif name == "ddpm":
        # https://huggingface.co/docs/diffusers/v0.17.1/en/api/schedulers/ddpm
        scheduler = DDPMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            num_train_timesteps=1000,
            clip_sample=False,
            prediction_type=prediction_type,
        )
    elif name == "lms":
        # https://huggingface.co/docs/diffusers/v0.17.1/en/api/schedulers/lms_discrete
        scheduler = LMSDiscreteScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            num_train_timesteps=1000,
            prediction_type=prediction_type,
        )
    elif name == "euler_a":
        # https://huggingface.co/docs/diffusers/v0.17.1/en/api/schedulers/euler_ancestral
        scheduler = EulerAncestralDiscreteScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            num_train_timesteps=1000,
            prediction_type=prediction_type,
        )
    else:
        raise ValueError(f"Unknown scheduler name: {name}")

    return scheduler
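A minimal usage sketch for load_models (illustrative, not part of the upload): it loads the SD v1 components from the diffusers hub repo with a DDIM scheduler.

tokenizer, text_encoder, unet, scheduler, vae = load_models(
    "CompVis/stable-diffusion-v1-4",
    ckpt_path=None,          # or a folder containing "unet_ema" to restore EMA weights
    scheduler_name="ddim",
    v2=False,
    v_pred=False,
    weight_dtype=torch.float32,
)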
utils/prompt_util.py
ADDED
@@ -0,0 +1,174 @@
1 |
+
from typing import Literal, Optional, Union, List
|
2 |
+
|
3 |
+
import yaml
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
|
7 |
+
from pydantic import BaseModel, root_validator
|
8 |
+
import torch
|
9 |
+
import copy
|
10 |
+
|
11 |
+
ACTION_TYPES = Literal[
|
12 |
+
"erase",
|
13 |
+
"enhance",
|
14 |
+
]
|
15 |
+
|
16 |
+
|
17 |
+
# XL は二種類必要なので
|
18 |
+
class PromptEmbedsXL:
|
19 |
+
text_embeds: torch.FloatTensor
|
20 |
+
pooled_embeds: torch.FloatTensor
|
21 |
+
|
22 |
+
def __init__(self, *args) -> None:
|
23 |
+
self.text_embeds = args[0]
|
24 |
+
self.pooled_embeds = args[1]
|
25 |
+
|
26 |
+
|
27 |
+
# SDv1.x, SDv2.x は FloatTensor、XL は PromptEmbedsXL
|
28 |
+
PROMPT_EMBEDDING = Union[torch.FloatTensor, PromptEmbedsXL]
|
29 |
+
|
30 |
+
|
31 |
+
class PromptEmbedsCache: # 使いまわしたいので
|
32 |
+
prompts: dict[str, PROMPT_EMBEDDING] = {}
|
33 |
+
|
34 |
+
def __setitem__(self, __name: str, __value: PROMPT_EMBEDDING) -> None:
|
35 |
+
self.prompts[__name] = __value
|
36 |
+
|
37 |
+
def __getitem__(self, __name: str) -> Optional[PROMPT_EMBEDDING]:
|
38 |
+
if __name in self.prompts:
|
39 |
+
return self.prompts[__name]
|
40 |
+
else:
|
41 |
+
return None
|
42 |
+
|
43 |
+
|
44 |
+
class PromptSettings(BaseModel): # yaml のやつ
|
45 |
+
target: str
|
46 |
+
positive: str = None # if None, target will be used
|
47 |
+
unconditional: str = "" # default is ""
|
48 |
+
neutral: str = None # if None, unconditional will be used
|
49 |
+
action: ACTION_TYPES = "erase" # default is "erase"
|
50 |
+
guidance_scale: float = 1.0 # default is 1.0
|
51 |
+
resolution: int = 512 # default is 512
|
52 |
+
dynamic_resolution: bool = False # default is False
|
53 |
+
batch_size: int = 1 # default is 1
|
54 |
+
dynamic_crops: bool = False # default is False. only used when model is XL
|
55 |
+
|
56 |
+
@root_validator(pre=True)
|
57 |
+
def fill_prompts(cls, values):
|
58 |
+
keys = values.keys()
|
59 |
+
if "target" not in keys:
|
60 |
+
raise ValueError("target must be specified")
|
61 |
+
if "positive" not in keys:
|
62 |
+
values["positive"] = values["target"]
|
63 |
+
if "unconditional" not in keys:
|
64 |
+
values["unconditional"] = ""
|
65 |
+
if "neutral" not in keys:
|
66 |
+
values["neutral"] = values["unconditional"]
|
67 |
+
|
68 |
+
return values
|
69 |
+
|
70 |
+
|
71 |
+
class PromptEmbedsPair:
|
72 |
+
target: PROMPT_EMBEDDING # not want to generate the concept
|
73 |
+
positive: PROMPT_EMBEDDING # generate the concept
|
74 |
+
unconditional: PROMPT_EMBEDDING # uncondition (default should be empty)
|
75 |
+
neutral: PROMPT_EMBEDDING # base condition (default should be empty)
|
76 |
+
|
77 |
+
guidance_scale: float
|
78 |
+
resolution: int
|
79 |
+
dynamic_resolution: bool
|
80 |
+
batch_size: int
|
81 |
+
dynamic_crops: bool
|
82 |
+
|
83 |
+
loss_fn: torch.nn.Module
|
84 |
+
action: ACTION_TYPES
|
85 |
+
|
86 |
+
def __init__(
|
87 |
+
self,
|
88 |
+
loss_fn: torch.nn.Module,
|
89 |
+
target: PROMPT_EMBEDDING,
|
90 |
+
positive: PROMPT_EMBEDDING,
|
91 |
+
unconditional: PROMPT_EMBEDDING,
|
92 |
+
neutral: PROMPT_EMBEDDING,
|
93 |
+
settings: PromptSettings,
|
94 |
+
) -> None:
|
95 |
+
self.loss_fn = loss_fn
|
96 |
+
self.target = target
|
97 |
+
self.positive = positive
|
98 |
+
self.unconditional = unconditional
|
99 |
+
self.neutral = neutral
|
100 |
+
|
101 |
+
self.guidance_scale = settings.guidance_scale
|
102 |
+
self.resolution = settings.resolution
|
103 |
+
self.dynamic_resolution = settings.dynamic_resolution
|
104 |
+
self.batch_size = settings.batch_size
|
105 |
+
self.dynamic_crops = settings.dynamic_crops
|
106 |
+
self.action = settings.action
|
107 |
+
|
108 |
+
def _erase(
|
109 |
+
self,
|
110 |
+
target_latents: torch.FloatTensor, # "van gogh"
|
111 |
+
positive_latents: torch.FloatTensor, # "van gogh"
|
112 |
+
unconditional_latents: torch.FloatTensor, # ""
|
113 |
+
neutral_latents: torch.FloatTensor, # ""
|
114 |
+
) -> torch.FloatTensor:
|
115 |
+
"""Target latents are going not to have the positive concept."""
|
116 |
+
return self.loss_fn(
|
117 |
+
target_latents,
|
118 |
+
neutral_latents
|
119 |
+
- self.guidance_scale * (positive_latents - unconditional_latents)
|
120 |
+
)
|
121 |
+
|
122 |
+
|
123 |
+
def _enhance(
|
124 |
+
self,
|
125 |
+
target_latents: torch.FloatTensor, # "van gogh"
|
126 |
+
positive_latents: torch.FloatTensor, # "van gogh"
|
127 |
+
unconditional_latents: torch.FloatTensor, # ""
|
128 |
+
neutral_latents: torch.FloatTensor, # ""
|
129 |
+
):
|
130 |
+
"""Target latents are going to have the positive concept."""
|
131 |
+
return self.loss_fn(
|
132 |
+
target_latents,
|
133 |
+
neutral_latents
|
134 |
+
+ self.guidance_scale * (positive_latents - unconditional_latents)
|
135 |
+
)
|
136 |
+
|
137 |
+
def loss(
|
138 |
+
self,
|
139 |
+
**kwargs,
|
140 |
+
):
|
141 |
+
if self.action == "erase":
|
142 |
+
return self._erase(**kwargs)
|
143 |
+
|
144 |
+
elif self.action == "enhance":
|
145 |
+
return self._enhance(**kwargs)
|
146 |
+
|
147 |
+
else:
|
148 |
+
raise ValueError("action must be erase or enhance")
|
149 |
+
|
150 |
+
|
151 |
+
def load_prompts_from_yaml(path, attributes=[]):
|
152 |
+
with open(path, "r") as f:
|
153 |
+
prompts = yaml.safe_load(f)
|
154 |
+
print(prompts)
|
155 |
+
if len(prompts) == 0:
|
156 |
+
raise ValueError("prompts file is empty")
|
157 |
+
if len(attributes) != 0:
|
158 |
+
newprompts = []
|
159 |
+
for i in range(len(prompts)):
|
160 |
+
for att in attributes:
|
161 |
+
copy_ = copy.deepcopy(prompts[i])
|
162 |
+
copy_['target'] = att + ' ' + copy_['target']
|
163 |
+
copy_['positive'] = att + ' ' + copy_['positive']
|
164 |
+
copy_['neutral'] = att + ' ' + copy_['neutral']
|
165 |
+
copy_['unconditional'] = att + ' ' + copy_['unconditional']
|
166 |
+
newprompts.append(copy_)
|
167 |
+
else:
|
168 |
+
newprompts = copy.deepcopy(prompts)
|
169 |
+
|
170 |
+
print(newprompts)
|
171 |
+
print(len(prompts), len(newprompts))
|
172 |
+
prompt_settings = [PromptSettings(**prompt) for prompt in newprompts]
|
173 |
+
|
174 |
+
return prompt_settings
|
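A minimal usage sketch for the prompt utilities above, assuming the repository root is on the import path and a hypothetical style_prompts.yaml file (neither assumption is part of this upload). A YAML entry that only specifies target is completed by the fill_prompts root validator; note that the attributes expansion indexes positive, neutral, and unconditional directly, so entries must spell out all four prompt fields whenever attributes are passed.

# style_prompts.yaml (hypothetical):
#   - target: "van gogh"
#     action: "erase"
#     guidance_scale: 1.0
from utils.prompt_util import load_prompts_from_yaml

settings = load_prompts_from_yaml("style_prompts.yaml")
for s in settings:
    # positive was filled from target, unconditional defaults to "", neutral from unconditional
    print(s.target, "|", s.positive, "|", repr(s.unconditional), "|", s.action)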
utils/train_util.py
ADDED
@@ -0,0 +1,526 @@
1 |
+
from typing import Optional, Union
|
2 |
+
|
3 |
+
import torch
|
4 |
+
|
5 |
+
from transformers import CLIPTextModel, CLIPTokenizer, BertModel, BertTokenizer
|
6 |
+
from diffusers import UNet2DConditionModel, SchedulerMixin
|
7 |
+
from diffusers.image_processor import VaeImageProcessor
|
8 |
+
import sys
|
9 |
+
import os
|
10 |
+
# sys.path.append(os.path.join(os.path.dirname(__file__), "../"))
|
11 |
+
# from imagesliders.model_util import SDXL_TEXT_ENCODER_TYPE
|
12 |
+
from diffusers.utils.torch_utils import randn_tensor
|
13 |
+
|
14 |
+
from transformers import CLIPTextModelWithProjection
|
15 |
+
|
16 |
+
SDXL_TEXT_ENCODER_TYPE = Union[CLIPTextModel, CLIPTextModelWithProjection]
|
17 |
+
|
18 |
+
from tqdm import tqdm
|
19 |
+
|
20 |
+
UNET_IN_CHANNELS = 4 # Stable Diffusion in_channels
|
21 |
+
VAE_SCALE_FACTOR = 8 # 2 ** (len(vae.config.block_out_channels) - 1) = 8
|
22 |
+
|
23 |
+
UNET_ATTENTION_TIME_EMBED_DIM = 256 # XL
|
24 |
+
TEXT_ENCODER_2_PROJECTION_DIM = 1280
|
25 |
+
UNET_PROJECTION_CLASS_EMBEDDING_INPUT_DIM = 2816
|
26 |
+
|
27 |
+
|
28 |
+
def get_random_noise(
|
29 |
+
batch_size: int, height: int, width: int, generator: torch.Generator = None
|
30 |
+
) -> torch.Tensor:
|
31 |
+
return torch.randn(
|
32 |
+
(
|
33 |
+
batch_size,
|
34 |
+
UNET_IN_CHANNELS,
|
35 |
+
height // VAE_SCALE_FACTOR,
|
36 |
+
width // VAE_SCALE_FACTOR,
|
37 |
+
),
|
38 |
+
generator=generator,
|
39 |
+
device="cpu",
|
40 |
+
)
|
41 |
+
|
42 |
+
|
43 |
+
|
44 |
+
def apply_noise_offset(latents: torch.FloatTensor, noise_offset: float):
|
45 |
+
latents = latents + noise_offset * torch.randn(
|
46 |
+
(latents.shape[0], latents.shape[1], 1, 1), device=latents.device
|
47 |
+
)
|
48 |
+
return latents
|
49 |
+
|
50 |
+
|
51 |
+
def get_initial_latents(
|
52 |
+
scheduler: SchedulerMixin,
|
53 |
+
n_imgs: int,
|
54 |
+
height: int,
|
55 |
+
width: int,
|
56 |
+
n_prompts: int,
|
57 |
+
generator=None,
|
58 |
+
) -> torch.Tensor:
|
59 |
+
noise = get_random_noise(n_imgs, height, width, generator=generator).repeat(
|
60 |
+
n_prompts, 1, 1, 1
|
61 |
+
)
|
62 |
+
|
63 |
+
latents = noise * scheduler.init_noise_sigma
|
64 |
+
|
65 |
+
return latents
|
66 |
+
|
67 |
+
|
68 |
+
def text_tokenize(
|
69 |
+
tokenizer,  # SD v1/v2 use a single tokenizer; SDXL uses two
|
70 |
+
prompts,
|
71 |
+
):
|
72 |
+
return tokenizer(
|
73 |
+
prompts,
|
74 |
+
padding="max_length",
|
75 |
+
max_length=tokenizer.model_max_length,
|
76 |
+
truncation=True,
|
77 |
+
return_tensors="pt",
|
78 |
+
)
|
79 |
+
|
80 |
+
|
81 |
+
def text_encode(text_encoder, tokens):
|
82 |
+
tokens = tokens.to(text_encoder.device)
|
83 |
+
if isinstance(text_encoder, BertModel):
|
84 |
+
embed = text_encoder(**tokens, return_dict=False)[0]
|
85 |
+
elif isinstance(text_encoder, CLIPTextModel):
|
86 |
+
# embed = text_encoder(**tokens, return_dict=False)[0]
|
87 |
+
embed = text_encoder(tokens.input_ids, return_dict=False)[0]
|
88 |
+
else:
|
89 |
+
raise ValueError("text_encoder must be BertModel or CLIPTextModel")
|
90 |
+
return embed
|
91 |
+
|
92 |
+
def encode_prompts(
|
93 |
+
tokenizer,
|
94 |
+
text_encoder,
|
95 |
+
prompts: list[str],
|
96 |
+
):
|
97 |
+
# print(f"prompts: {prompts}")
|
98 |
+
text_tokens = text_tokenize(tokenizer, prompts)
|
99 |
+
# print(f"text_tokens: {text_tokens}")
|
100 |
+
text_embeddings = text_encode(text_encoder, text_tokens)
|
101 |
+
# print(f"text_embeddings: {text_embeddings}")
|
102 |
+
|
103 |
+
|
104 |
+
return text_embeddings
|
105 |
+
|
106 |
+
def prompt_replace(original, key="{prompt}", prompt=""):
|
107 |
+
if key not in original:
|
108 |
+
return original
|
109 |
+
|
110 |
+
if isinstance(prompt, list):
|
111 |
+
ret = []
|
112 |
+
for p in prompt:
|
113 |
+
p = p.replace(".", "")
|
114 |
+
r = original.replace(key, p)
|
115 |
+
r = r.capitalize()
|
116 |
+
ret.append(r)
|
117 |
+
else:
|
118 |
+
prompt = prompt.replace(".", "")
|
119 |
+
ret = original.replace(key, prompt)
|
120 |
+
ret = ret.capitalize()
|
121 |
+
return ret
|
122 |
+
|
123 |
+
|
124 |
+
|
125 |
+
def text_encode_xl(
|
126 |
+
text_encoder: SDXL_TEXT_ENCODER_TYPE,
|
127 |
+
tokens: torch.FloatTensor,
|
128 |
+
num_images_per_prompt: int = 1,
|
129 |
+
):
|
130 |
+
prompt_embeds = text_encoder(
|
131 |
+
tokens.to(text_encoder.device), output_hidden_states=True
|
132 |
+
)
|
133 |
+
pooled_prompt_embeds = prompt_embeds[0]
|
134 |
+
prompt_embeds = prompt_embeds.hidden_states[-2] # always penultimate layer
|
135 |
+
|
136 |
+
bs_embed, seq_len, _ = prompt_embeds.shape
|
137 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
138 |
+
prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
|
139 |
+
|
140 |
+
return prompt_embeds, pooled_prompt_embeds
|
141 |
+
|
142 |
+
|
143 |
+
def encode_prompts_xl(
|
144 |
+
tokenizers: list[CLIPTokenizer],
|
145 |
+
text_encoders: list[SDXL_TEXT_ENCODER_TYPE],
|
146 |
+
prompts: list[str],
|
147 |
+
num_images_per_prompt: int = 1,
|
148 |
+
) -> tuple[torch.FloatTensor, torch.FloatTensor]:
|
149 |
+
# text_encoder and text_encoder_2's penultimate layer outputs
|
150 |
+
text_embeds_list = []
|
151 |
+
pooled_text_embeds = None # always text_encoder_2's pool
|
152 |
+
|
153 |
+
for tokenizer, text_encoder in zip(tokenizers, text_encoders):
|
154 |
+
text_tokens_input_ids = text_tokenize(tokenizer, prompts)
|
155 |
+
text_embeds, pooled_text_embeds = text_encode_xl(
|
156 |
+
text_encoder, text_tokens_input_ids, num_images_per_prompt
|
157 |
+
)
|
158 |
+
|
159 |
+
text_embeds_list.append(text_embeds)
|
160 |
+
|
161 |
+
bs_embed = pooled_text_embeds.shape[0]
|
162 |
+
pooled_text_embeds = pooled_text_embeds.repeat(1, num_images_per_prompt).view(
|
163 |
+
bs_embed * num_images_per_prompt, -1
|
164 |
+
)
|
165 |
+
|
166 |
+
return torch.concat(text_embeds_list, dim=-1), pooled_text_embeds
|
167 |
+
|
168 |
+
|
169 |
+
def concat_embeddings(
|
170 |
+
unconditional: torch.FloatTensor,
|
171 |
+
conditional: torch.FloatTensor,
|
172 |
+
n_imgs: int,
|
173 |
+
):
|
174 |
+
if conditional.shape[0] == n_imgs and unconditional.shape[0] == 1:
|
175 |
+
return torch.cat([unconditional.repeat(n_imgs, 1, 1), conditional], dim=0)
|
176 |
+
return torch.cat([unconditional, conditional]).repeat_interleave(n_imgs, dim=0)
|
177 |
+
|
178 |
+
|
179 |
+
def predict_noise(
|
180 |
+
unet: UNet2DConditionModel,
|
181 |
+
scheduler: SchedulerMixin,
|
182 |
+
timestep: int,
|
183 |
+
latents: torch.FloatTensor,
|
184 |
+
text_embeddings: torch.FloatTensor,  # unconditional and conditional text embeddings concatenated along the batch dim
|
185 |
+
guidance_scale=7.5,
|
186 |
+
) -> torch.FloatTensor:
|
187 |
+
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
|
188 |
+
latent_model_input = torch.cat([latents] * 2)
|
189 |
+
|
190 |
+
latent_model_input = scheduler.scale_model_input(latent_model_input, timestep)
|
191 |
+
# batch_size = latents.shape[0]
|
192 |
+
# text_embeddings = text_embeddings.repeat_interleave(batch_size, dim=0)
|
193 |
+
# predict the noise residual
|
194 |
+
noise_pred = unet(
|
195 |
+
latent_model_input,
|
196 |
+
timestep,
|
197 |
+
encoder_hidden_states=text_embeddings,
|
198 |
+
).sample
|
199 |
+
|
200 |
+
# perform guidance
|
201 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
202 |
+
guided_target = noise_pred_uncond + guidance_scale * (
|
203 |
+
noise_pred_text - noise_pred_uncond
|
204 |
+
)
|
205 |
+
|
206 |
+
return guided_target
|
207 |
+
|
208 |
+
|
209 |
+
|
210 |
+
@torch.no_grad()
|
211 |
+
def diffusion(
|
212 |
+
unet: UNet2DConditionModel,
|
213 |
+
scheduler: SchedulerMixin,
|
214 |
+
latents: torch.FloatTensor,
|
215 |
+
text_embeddings: torch.FloatTensor,
|
216 |
+
total_timesteps: int = 1000,
|
217 |
+
start_timesteps=0,
|
218 |
+
**kwargs,
|
219 |
+
):
|
220 |
+
# latents_steps = []
|
221 |
+
|
222 |
+
for timestep in scheduler.timesteps[start_timesteps:total_timesteps]:
|
223 |
+
noise_pred = predict_noise(
|
224 |
+
unet, scheduler, timestep, latents, text_embeddings, **kwargs
|
225 |
+
)
|
226 |
+
|
227 |
+
# compute the previous noisy sample x_t -> x_t-1
|
228 |
+
latents = scheduler.step(noise_pred, timestep, latents).prev_sample
|
229 |
+
|
230 |
+
# return latents_steps
|
231 |
+
return latents
|
232 |
+
|
233 |
+
@torch.no_grad()
|
234 |
+
def get_noisy_image(
|
235 |
+
img,
|
236 |
+
vae,
|
237 |
+
generator,
|
238 |
+
unet: UNet2DConditionModel,
|
239 |
+
scheduler: SchedulerMixin,
|
240 |
+
total_timesteps: int = 1000,
|
241 |
+
start_timesteps=0,
|
242 |
+
|
243 |
+
**kwargs,
|
244 |
+
):
|
245 |
+
# latents_steps = []
|
246 |
+
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
|
247 |
+
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)
|
248 |
+
|
249 |
+
image = img
|
250 |
+
# im_orig = image
|
251 |
+
device = vae.device
|
252 |
+
image = image_processor.preprocess(image).to(device)
|
253 |
+
|
254 |
+
init_latents = vae.encode(image).latent_dist.sample(None)
|
255 |
+
init_latents = vae.config.scaling_factor * init_latents
|
256 |
+
|
257 |
+
init_latents = torch.cat([init_latents], dim=0)
|
258 |
+
|
259 |
+
shape = init_latents.shape
|
260 |
+
|
261 |
+
noise = randn_tensor(shape, generator=generator, device=device)
|
262 |
+
|
263 |
+
time_ = total_timesteps
|
264 |
+
timestep = scheduler.timesteps[time_:time_+1]
|
265 |
+
# get latents
|
266 |
+
noised_latents = scheduler.add_noise(init_latents, noise, timestep)
|
267 |
+
|
268 |
+
return noised_latents, noise, init_latents
|
269 |
+
|
270 |
+
def subtract_noise(
|
271 |
+
latent: torch.FloatTensor,
|
272 |
+
noise: torch.FloatTensor,
|
273 |
+
timesteps: torch.IntTensor,
|
274 |
+
scheduler: SchedulerMixin,
|
275 |
+
) -> torch.FloatTensor:
|
276 |
+
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
277 |
+
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
278 |
+
# for the subsequent add_noise calls
|
279 |
+
scheduler.alphas_cumprod = scheduler.alphas_cumprod.to(device=latent.device)
|
280 |
+
alphas_cumprod = scheduler.alphas_cumprod.to(dtype=latent.dtype)
|
281 |
+
timesteps = timesteps.to(latent.device)
|
282 |
+
|
283 |
+
sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5
|
284 |
+
sqrt_alpha_prod = sqrt_alpha_prod.flatten()
|
285 |
+
while len(sqrt_alpha_prod.shape) < len(latent.shape):
|
286 |
+
sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1)
|
287 |
+
|
288 |
+
sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5
|
289 |
+
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten()
|
290 |
+
while len(sqrt_one_minus_alpha_prod.shape) < len(latent.shape):
|
291 |
+
sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1)
|
292 |
+
|
293 |
+
denoised_latent = (latent - sqrt_one_minus_alpha_prod * noise) / sqrt_alpha_prod
|
294 |
+
return denoised_latent
|
295 |
+
def get_denoised_image(
|
296 |
+
latents: torch.FloatTensor,
|
297 |
+
noise_pred: torch.FloatTensor,
|
298 |
+
timestep: int,
|
299 |
+
# total_timesteps: int,
|
300 |
+
scheduler: SchedulerMixin,
|
301 |
+
vae,  # AutoencoderKL, used to scale and decode the latents
|
302 |
+
):
|
303 |
+
denoised_latents = subtract_noise(latents, noise_pred, timestep, scheduler)
|
304 |
+
denoised_latents = denoised_latents / vae.config.scaling_factor # 0.18215
|
305 |
+
denoised_img = vae.decode(denoised_latents).sample
|
306 |
+
# denoised_img = denoised_img.clamp(-1,1)
|
307 |
+
return denoised_img
|
308 |
+
|
309 |
+
|
310 |
+
def rescale_noise_cfg(
|
311 |
+
noise_cfg: torch.FloatTensor, noise_pred_text, guidance_rescale=0.0
|
312 |
+
):
|
313 |
+
|
314 |
+
std_text = noise_pred_text.std(
|
315 |
+
dim=list(range(1, noise_pred_text.ndim)), keepdim=True
|
316 |
+
)
|
317 |
+
std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
|
318 |
+
# rescale the results from guidance (fixes overexposure)
|
319 |
+
noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
|
320 |
+
# mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
|
321 |
+
noise_cfg = (
|
322 |
+
guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
|
323 |
+
)
|
324 |
+
|
325 |
+
return noise_cfg
|
326 |
+
|
327 |
+
|
328 |
+
def predict_noise_xl(
|
329 |
+
unet: UNet2DConditionModel,
|
330 |
+
scheduler: SchedulerMixin,
|
331 |
+
timestep: int,
|
332 |
+
latents: torch.FloatTensor,
|
333 |
+
text_embeddings: torch.FloatTensor,  # unconditional and conditional text embeddings concatenated along the batch dim
|
334 |
+
add_text_embeddings: torch.FloatTensor,  # pooled embeddings from text_encoder_2
|
335 |
+
add_time_ids: torch.FloatTensor,
|
336 |
+
guidance_scale=7.5,
|
337 |
+
guidance_rescale=0.7,
|
338 |
+
) -> torch.FloatTensor:
|
339 |
+
# expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
|
340 |
+
latent_model_input = torch.cat([latents] * 2)
|
341 |
+
|
342 |
+
latent_model_input = scheduler.scale_model_input(latent_model_input, timestep)
|
343 |
+
|
344 |
+
added_cond_kwargs = {
|
345 |
+
"text_embeds": add_text_embeddings,
|
346 |
+
"time_ids": add_time_ids,
|
347 |
+
}
|
348 |
+
|
349 |
+
# predict the noise residual
|
350 |
+
noise_pred = unet(
|
351 |
+
latent_model_input,
|
352 |
+
timestep,
|
353 |
+
encoder_hidden_states=text_embeddings,
|
354 |
+
added_cond_kwargs=added_cond_kwargs,
|
355 |
+
).sample
|
356 |
+
|
357 |
+
# perform guidance
|
358 |
+
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
359 |
+
guided_target = noise_pred_uncond + guidance_scale * (
|
360 |
+
noise_pred_text - noise_pred_uncond
|
361 |
+
)
|
362 |
+
|
363 |
+
guided_target = rescale_noise_cfg(
|
364 |
+
guided_target, noise_pred_text, guidance_rescale=guidance_rescale
|
365 |
+
)
|
366 |
+
|
367 |
+
return guided_target
|
368 |
+
|
369 |
+
|
370 |
+
@torch.no_grad()
|
371 |
+
def diffusion_xl(
|
372 |
+
unet: UNet2DConditionModel,
|
373 |
+
scheduler: SchedulerMixin,
|
374 |
+
latents: torch.FloatTensor,
|
375 |
+
text_embeddings: tuple[torch.FloatTensor, torch.FloatTensor],
|
376 |
+
add_text_embeddings: torch.FloatTensor,
|
377 |
+
add_time_ids: torch.FloatTensor,
|
378 |
+
guidance_scale: float = 1.0,
|
379 |
+
total_timesteps: int = 1000,
|
380 |
+
start_timesteps=0,
|
381 |
+
):
|
382 |
+
# latents_steps = []
|
383 |
+
|
384 |
+
for timestep in tqdm(scheduler.timesteps[start_timesteps:total_timesteps]):
|
385 |
+
noise_pred = predict_noise_xl(
|
386 |
+
unet,
|
387 |
+
scheduler,
|
388 |
+
timestep,
|
389 |
+
latents,
|
390 |
+
text_embeddings,
|
391 |
+
add_text_embeddings,
|
392 |
+
add_time_ids,
|
393 |
+
guidance_scale=guidance_scale,
|
394 |
+
guidance_rescale=0.7,
|
395 |
+
)
|
396 |
+
|
397 |
+
# compute the previous noisy sample x_t -> x_t-1
|
398 |
+
latents = scheduler.step(noise_pred, timestep, latents).prev_sample
|
399 |
+
|
400 |
+
# return latents_steps
|
401 |
+
return latents
|
402 |
+
|
403 |
+
|
404 |
+
# for XL
|
405 |
+
def get_add_time_ids(
|
406 |
+
height: int,
|
407 |
+
width: int,
|
408 |
+
dynamic_crops: bool = False,
|
409 |
+
dtype: torch.dtype = torch.float32,
|
410 |
+
):
|
411 |
+
if dynamic_crops:
|
412 |
+
# random float scale between 1 and 3
|
413 |
+
random_scale = torch.rand(1).item() * 2 + 1
|
414 |
+
original_size = (int(height * random_scale), int(width * random_scale))
|
415 |
+
# random position
|
416 |
+
crops_coords_top_left = (
|
417 |
+
torch.randint(0, original_size[0] - height, (1,)).item(),
|
418 |
+
torch.randint(0, original_size[1] - width, (1,)).item(),
|
419 |
+
)
|
420 |
+
target_size = (height, width)
|
421 |
+
else:
|
422 |
+
original_size = (height, width)
|
423 |
+
crops_coords_top_left = (0, 0)
|
424 |
+
target_size = (height, width)
|
425 |
+
|
426 |
+
# this is expected as 6
|
427 |
+
add_time_ids = list(original_size + crops_coords_top_left + target_size)
|
428 |
+
|
429 |
+
# this is expected as 2816
|
430 |
+
passed_add_embed_dim = (
|
431 |
+
UNET_ATTENTION_TIME_EMBED_DIM * len(add_time_ids) # 256 * 6
|
432 |
+
+ TEXT_ENCODER_2_PROJECTION_DIM # + 1280
|
433 |
+
)
|
434 |
+
if passed_add_embed_dim != UNET_PROJECTION_CLASS_EMBEDDING_INPUT_DIM:
|
435 |
+
raise ValueError(
|
436 |
+
f"Model expects an added time embedding vector of length {UNET_PROJECTION_CLASS_EMBEDDING_INPUT_DIM}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
|
437 |
+
)
|
438 |
+
|
439 |
+
add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
|
440 |
+
return add_time_ids
|
441 |
+
|
442 |
+
|
443 |
+
def get_optimizer(name: str):
|
444 |
+
name = name.lower()
|
445 |
+
|
446 |
+
if name.startswith("dadapt"):
|
447 |
+
import dadaptation
|
448 |
+
|
449 |
+
if name == "dadaptadam":
|
450 |
+
return dadaptation.DAdaptAdam
|
451 |
+
elif name == "dadaptlion":
|
452 |
+
return dadaptation.DAdaptLion
|
453 |
+
else:
|
454 |
+
raise ValueError("DAdapt optimizer must be dadaptadam or dadaptlion")
|
455 |
+
|
456 |
+
elif name.endswith("8bit"):
|
457 |
+
import bitsandbytes as bnb
|
458 |
+
|
459 |
+
if name == "adam8bit":
|
460 |
+
return bnb.optim.Adam8bit
|
461 |
+
elif name == "lion8bit":
|
462 |
+
return bnb.optim.Lion8bit
|
463 |
+
else:
|
464 |
+
raise ValueError("8bit optimizer must be adam8bit or lion8bit")
|
465 |
+
|
466 |
+
else:
|
467 |
+
if name == "adam":
|
468 |
+
return torch.optim.Adam
|
469 |
+
elif name == "adamw":
|
470 |
+
return torch.optim.AdamW
|
471 |
+
elif name == "lion":
|
472 |
+
from lion_pytorch import Lion
|
473 |
+
|
474 |
+
return Lion
|
475 |
+
elif name == "prodigy":
|
476 |
+
import prodigyopt
|
477 |
+
|
478 |
+
return prodigyopt.Prodigy
|
479 |
+
else:
|
480 |
+
raise ValueError("Optimizer must be adam, adamw, lion or Prodigy")
|
481 |
+
|
482 |
+
|
483 |
+
def get_lr_scheduler(
|
484 |
+
name: Optional[str],
|
485 |
+
optimizer: torch.optim.Optimizer,
|
486 |
+
max_iterations: Optional[int],
|
487 |
+
lr_min: Optional[float],
|
488 |
+
**kwargs,
|
489 |
+
):
|
490 |
+
if name == "cosine":
|
491 |
+
return torch.optim.lr_scheduler.CosineAnnealingLR(
|
492 |
+
optimizer, T_max=max_iterations, eta_min=lr_min, **kwargs
|
493 |
+
)
|
494 |
+
elif name == "cosine_with_restarts":
|
495 |
+
return torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
|
496 |
+
optimizer, T_0=max_iterations // 10, T_mult=2, eta_min=lr_min, **kwargs
|
497 |
+
)
|
498 |
+
elif name == "step":
|
499 |
+
return torch.optim.lr_scheduler.StepLR(
|
500 |
+
optimizer, step_size=max_iterations // 100, gamma=0.999, **kwargs
|
501 |
+
)
|
502 |
+
elif name == "constant":
|
503 |
+
return torch.optim.lr_scheduler.ConstantLR(optimizer, factor=1, **kwargs)
|
504 |
+
elif name == "linear":
|
505 |
+
return torch.optim.lr_scheduler.LinearLR(
|
506 |
+
optimizer, factor=0.5, total_iters=max_iterations // 100, **kwargs
|
507 |
+
)
|
508 |
+
else:
|
509 |
+
raise ValueError(
|
510 |
+
"Scheduler must be cosine, cosine_with_restarts, step, linear or constant"
|
511 |
+
)
|
512 |
+
|
513 |
+
|
514 |
+
def get_random_resolution_in_bucket(bucket_resolution: int = 512) -> tuple[int, int]:
|
515 |
+
max_resolution = bucket_resolution
|
516 |
+
min_resolution = bucket_resolution // 2
|
517 |
+
|
518 |
+
step = 64
|
519 |
+
|
520 |
+
min_step = min_resolution // step
|
521 |
+
max_step = max_resolution // step
|
522 |
+
|
523 |
+
height = torch.randint(min_step, max_step, (1,)).item() * step
|
524 |
+
width = torch.randint(min_step, max_step, (1,)).item() * step
|
525 |
+
|
526 |
+
return height, width
|
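To see how the SD v1.x helpers in utils/train_util.py compose, here is a hedged end-to-end sampling sketch rather than a definitive recipe: the checkpoint id, the 30-step DDIM schedule, the fixed seed, and the final VAE decode are assumptions drawn from the standard diffusers workflow, and the repository root is assumed to be importable.

import torch
from diffusers import StableDiffusionPipeline, DDIMScheduler
from utils.train_util import (
    encode_prompts, concat_embeddings, get_initial_latents, diffusion,
)

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")  # assumed checkpoint
scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
scheduler.set_timesteps(30)

# Classifier-free-guidance embeddings: [unconditional, conditional] along the batch dim.
cond = encode_prompts(pipe.tokenizer, pipe.text_encoder, ["a painting of a cat"])
uncond = encode_prompts(pipe.tokenizer, pipe.text_encoder, [""])
text_embeddings = concat_embeddings(uncond, cond, n_imgs=1)

latents = get_initial_latents(
    scheduler, n_imgs=1, height=512, width=512, n_prompts=1,
    generator=torch.Generator().manual_seed(0),
)
latents = diffusion(
    pipe.unet, scheduler, latents, text_embeddings,
    total_timesteps=30, guidance_scale=7.5,
)
# Decode to an image tensor in [-1, 1]; move the modules to CUDA for realistic runtimes.
image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor).sample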